## Cleaning and preparing the data for model training

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
df_train = pd.read_csv('train.csv')

In [None]:
df_train.head()

In [None]:
df_train.isnull().sum()

In [None]:
df_test=pd.read_csv('test.csv')

In [None]:
df_test.head()

In [None]:
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

In [None]:
## Basic code :
df.info()

In [None]:
df.describe()

In [None]:
df.drop(['User_ID'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
## Data Preprocessing

## Encode the categorical Gender column as integers: 'F' -> 0, 'M' -> 1.
## Any value outside the mapping becomes NaN (Series.map semantics).
gender_codes = {'F': 0, 'M': 1}
df['Gender'] = df['Gender'].map(gender_codes)
df.head()

In [None]:
## Handling categorical feature Age
df['Age'].unique()

In [None]:
df['Age']=df['Age'].map({'0-17':1, '18-25':2,'26-35':3, '36-45':4,'46-50':5,'51-55':6,'55+':7})

In [None]:
df.head()

In [None]:
## Handling categorical feature City_Category
df_city=pd.get_dummies(df['City_Category'],drop_first=True)

df_city = df_city.astype(int)
df_city.head()

In [None]:
df=pd.concat([df,df_city], axis=1)
df.head()

In [None]:
df.drop('City_Category', axis=1, inplace=True)

In [None]:
df.head()

In [None]:
df.isnull().sum()

## Focus on replacing missing values:
Product_Category_2 and Product_Category_3

In [None]:
df['Product_Category_2'].value_counts()

In [None]:
##Replace the missing values with mode because its discrete values
df['Product_Category_2']=df['Product_Category_2'].fillna(df['Product_Category_2'].mode()[0])


In [None]:
df['Product_Category_2'].value_counts()

In [None]:
df['Product_Category_2'].isnull().sum()

In [None]:
df['Product_Category_3']=df['Product_Category_3'].fillna(df['Product_Category_3'].mode()[0])


In [None]:
df['Product_Category_3'].value_counts()

In [None]:
df.shape

In [None]:
# Remove every literal '+' character from the column's strings.
# '+' is a regex quantifier and the default of `regex` differs between pandas
# versions, so regex=False is passed explicitly to force a plain-text match.
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].str.replace('+', '', regex=False)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
## Convert object into integers
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].astype(int)

In [None]:
df.info()

In [None]:
sns.barplot(x='Age', y='Purchase' , hue='Gender', data=df)

In [None]:
sns.barplot(x='Occupation', y='Purchase' , hue='Gender', data=df)

In [None]:
sns.barplot(x='Product_Category_1', y='Purchase' , hue='Gender', data=df)

In [None]:
sns.barplot(x='Product_Category_2', y='Purchase' , hue='Gender', data=df)

In [None]:
df.head()

In [None]:
df['Product_Category_2'].describe()

In [None]:
sns.barplot(x='Product_Category_3', y='Purchase' , hue='Gender', data=df)

## Split Train and Test data

In [None]:
df_test=df[df['Purchase'].isnull()]

In [None]:
df_train = df[~df['Purchase'].isnull()]


In [73]:
X = df_train.drop('Purchase', axis=1)

In [None]:
y = df_train['Purchase']


In [76]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows of X / y for evaluation; random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [77]:
# Remove Product_ID from both splits. Rebinding to the frame returned by
# drop() instead of dropping in place avoids mutating objects derived from the
# split, which can raise SettingWithCopyWarning on pandas without copy-on-write.
X_train = X_train.drop(columns='Product_ID')
X_test = X_test.drop(columns='Product_ID')


In [78]:
## Feature Scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
## Train our model
