## Clean datasets

In [95]:
import pandas as pd

#### Clean User Dataset

    - user_id
    - age
    - gender
    - occupation
    - zip_code

In [96]:
users = pd.read_csv("../books_data/users.csv", sep=";", encoding="latin-1")

In [97]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [98]:
# Clean the Age column
users.drop(users[(users.Age >=100)].index, inplace=True)
m = users['Age'].mean()
users["Age"] = users["Age"].fillna(m)
users["Age"] = users["Age"].astype(int)

In [99]:
# split la colonne location en 3 colonnes : city, state, country
transition=users.Location.str.split(pat=', ',expand=True)
transition=transition.drop([3,4,5,6,7,8], axis=1)
transition.columns = ['City', 'State', 'Country']
transition['Country']=transition['Country'].str.upper()

In [100]:
users = pd.concat([users['User-ID'],transition['Country'],users['Age']], axis=1)

In [101]:
users.head()

Unnamed: 0,User-ID,Country,Age
0,1,USA,34
1,2,USA,18
2,3,RUSSIA,34
3,4,PORTUGAL,17
4,5,UNITED KINGDOM,34


In [102]:
import pycountry

# create a list of valid country names using pycountry
valid_countries = [country.name for country in pycountry.countries]

# define a function to clean country names
def clean_country(country):
    # remove any leading or trailing whitespace
    country = str(country).strip()
    # convert to title case
    country = country.title()
    # replace common misspellings or abbreviations
    if country == "Usa":
        country = "United States"
    # check if the cleaned name is in the list of valid countries
    if country in valid_countries:
        return country
    else:
        return ''

In [103]:
users["Country"] = users["Country"].apply(clean_country)
users.dropna(inplace=True)

In [104]:
users.head()

Unnamed: 0,User-ID,Country,Age
0,1,United States,34
1,2,United States,18
2,3,,34
3,4,Portugal,17
4,5,United Kingdom,34


In [105]:
import numpy as np

In [106]:
users['gender'] = np.random.choice(['F', 'M'], size=len(users))
my_list = ['administrator', 'artist', 'doctor', 'educator', 'engineer', 'entertainment', 'executive', 'healthcare', 'homemaker', 'lawyer', 'librarian', 'marketing', 'none', 'other', 'programmer', 'retired', 'salesman', 'scientist', 'student', 'technician', 'writer']

users['occupation'] = np.random.choice(my_list, size=len(users))

In [107]:
users["Country"] = np.random.choice(["00","11","21","31","41","51","61","71","81","91"], size=len(users))
users["Country"] = users["Country"].astype(str)

In [108]:
users.head()

Unnamed: 0,User-ID,Country,Age,gender,occupation
0,1,7,34,F,executive
1,2,1,18,M,administrator
2,3,7,34,F,none
3,4,4,17,F,none
4,5,3,34,M,retired


In [109]:
users["Country"][1][0]

'1'

In [110]:
users = users[["User-ID", "Age", "gender", "occupation", "Country"]]

In [111]:
users.head()

Unnamed: 0,User-ID,Age,gender,occupation,Country
0,1,34,F,executive,7
1,2,18,M,administrator,1
2,3,34,F,none,7
3,4,17,F,none,4
4,5,34,M,retired,3


In [112]:
users.to_csv("../Data2/u.user",sep='|',encoding='latin-1',header=False,index=False)

#### Clean Books dataset

In [None]:
df_books = pd.read_csv("../books_data/Books.csv")
df_books.head(3)

  df_books = pd.read_csv("Data/Books.csv")


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [None]:
df_books = df_books.drop(columns = ['Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L'])
df_books.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication
0,195153448,Classical Mythology,Mark P. O. Morford,2002
1,2005018,Clara Callan,Richard Bruce Wright,2001
2,60973129,Decision in Normandy,Carlo D'Este,1991


In [None]:
import pandas as pd
import numpy as np

# Remplacez cette ligne par le chargement de votre fichier csv df_books
# df_books = pd.read_csv('your_csv_file.csv')

categories = ['unknown','Action','Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War' ,'Western']

# Ajouter les colonnes avec des valeurs 0 par défaut
for cat in categories:
    df_books[cat] = 0

# Fonction pour assigner aléatoirement des valeurs 0 ou 1 aux catégories
def assign_random_categories(row):
    num_categories = np.random.randint(0, 4)  # générer un nombre aléatoire entre 0 et 3
    if num_categories == 0:
        row['unknown'] = 1
    else:
        chosen_categories = np.random.choice(categories[1:], num_categories, replace=False)
        for cat in chosen_categories:
            row[cat] = 1
    return row

# Appliquer la fonction aux lignes du DataFrame
df_books = df_books.apply(assign_random_categories, axis=1)



In [None]:
df_books.head(10)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,unknown,Action,Adventure,Animation,Childrens,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0060973129,Decision in Normandy,Carlo D'Este,1991,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
7,0671870432,PLEADING GUILTY,Scott Turow,1993,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_books = df_books.drop(columns = ['Book-Author'])

In [None]:
df_books.to_csv("u.books", sep='|', header=True, index=False, encoding='latin-1')

#### Clean Ratings Dataset

In [None]:
ratings = pd.read_csv("../books_data/ratings.csv")

In [None]:
ratings.to_csv("u.data", sep=' ', header=True, index=False, encoding='latin-1')