In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

#### Importing the dataset:

In [4]:
df = pd.read_excel("dataset_revisedVersion.xlsx")

In [5]:
df.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,3.7345,522.83,1/5/2019,13:08:00,Ewallet,110.429824,78.878445,412.400176,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,0.764,76.4,3/8/2019,10:29:00,Cash,43.310658,43.310658,33.089342,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,2.3165,324.31,3/3/2019,13:23:00,Credit card,97.786823,69.847731,226.523177,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,2.911,465.76,1/27/2019,20:33:00,Ewallet,119.089747,74.431092,346.670253,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,4.3155,604.17,2/8/2019,10:37:00,Ewallet,113.661932,81.187094,490.508068,5.3


#### Dropping the columns we don't need :

In [6]:
df.drop(['Invoice ID','Branch','gross income'],axis=1,inplace=True)

In [7]:
df.head()

Unnamed: 0,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,Rating
0,Yangon,Member,Female,Health and beauty,74.69,7,3.7345,522.83,1/5/2019,13:08:00,Ewallet,110.429824,78.878445,9.1
1,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,0.764,76.4,3/8/2019,10:29:00,Cash,43.310658,43.310658,9.6
2,Yangon,Normal,Male,Home and lifestyle,46.33,7,2.3165,324.31,3/3/2019,13:23:00,Credit card,97.786823,69.847731,7.4
3,Yangon,Member,Male,Health and beauty,58.22,8,2.911,465.76,1/27/2019,20:33:00,Ewallet,119.089747,74.431092,8.4
4,Yangon,Normal,Male,Sports and travel,86.31,7,4.3155,604.17,2/8/2019,10:37:00,Ewallet,113.661932,81.187094,5.3


## Data transformation:

#### Transforming the date to an ordinal number (number of days):

In [10]:
df['Date'] = pd.to_datetime(df['Date']) 
df['Date']=df['Date'].map(dt.datetime.toordinal)

#### Transforming the time to an ordinal number (number of minutes):

In [11]:
df['Hour'] = df['Time'].apply(lambda x: x.hour)
df['Minute'] = df['Time'].apply(lambda x: x.minute)

df['Minutes_Since_Midnight'] = df['Hour'] * 60 + df['Minute']
df.drop(['Hour','Minute','Time'],inplace=True,axis=1)

In [12]:
df.head()

Unnamed: 0,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Payment,cogs,gross margin percentage,Rating,Minutes_Since_Midnight
0,Yangon,Member,Female,Health and beauty,74.69,7,3.7345,522.83,737064,Ewallet,110.429824,78.878445,9.1,788
1,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,0.764,76.4,737126,Cash,43.310658,43.310658,9.6,629
2,Yangon,Normal,Male,Home and lifestyle,46.33,7,2.3165,324.31,737121,Credit card,97.786823,69.847731,7.4,803
3,Yangon,Member,Male,Health and beauty,58.22,8,2.911,465.76,737086,Ewallet,119.089747,74.431092,8.4,1233
4,Yangon,Normal,Male,Sports and travel,86.31,7,4.3155,604.17,737098,Ewallet,113.661932,81.187094,5.3,637


#### Ordinal Label Encoding:

In [13]:
le=LabelEncoder()
list_cate = ['City', 'Customer type', 'Gender', 'Product line', 'Payment']
for i in list_cate:
    df[i]=le.fit_transform(df[i])

In [14]:
df.head()

Unnamed: 0,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Payment,cogs,gross margin percentage,Rating,Minutes_Since_Midnight
0,2,0,0,3,74.69,7,3.7345,522.83,737064,2,110.429824,78.878445,9.1,788
1,1,1,0,0,15.28,5,0.764,76.4,737126,0,43.310658,43.310658,9.6,629
2,2,1,1,4,46.33,7,2.3165,324.31,737121,1,97.786823,69.847731,7.4,803
3,2,0,1,3,58.22,8,2.911,465.76,737086,2,119.089747,74.431092,8.4,1233
4,2,1,1,5,86.31,7,4.3155,604.17,737098,2,113.661932,81.187094,5.3,637


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   City                     1000 non-null   int32  
 1   Customer type            1000 non-null   int32  
 2   Gender                   1000 non-null   int32  
 3   Product line             1000 non-null   int32  
 4   Unit price               1000 non-null   float64
 5   Quantity                 1000 non-null   int64  
 6   Tax 5%                   1000 non-null   float64
 7   Total                    1000 non-null   float64
 8   Date                     1000 non-null   int64  
 9   Payment                  1000 non-null   int32  
 10  cogs                     1000 non-null   float64
 11  gross margin percentage  1000 non-null   float64
 12  Rating                   1000 non-null   float64
 13  Minutes_Since_Midnight   1000 non-null   int64  
dtypes: float64(6), int32(5), 

## Model building:

### Defining the target variable

In [45]:
y=df['Gender']
X=df.drop('Gender',axis=1)

### Splitting the dataset into training and testing sets (80% for training and 20% for testing)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Classification

#### Linear SVC

In [48]:
model=LinearSVC()
model.fit(X_train, y_train)

##### Training / Testing score

In [49]:
print(f"Train score: {round(model.score(X_train, y_train), 4) * 100} % \nTest score: {round(model.score(X_test, y_test), 4) * 100} %")

Train score: 50.62 % 
Test score: 48.0 %


##### Data normalization

In [50]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# Creation of a pipeline to normalize data and train the model
pipeline = make_pipeline(StandardScaler(), LinearSVC())

# Model training
pipeline.fit(X_train, y_train)

# Evaluation
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"Train score: {round(train_score, 4) * 100} % \nTest score: {round(test_score, 4) * 100} %")


Train score: 55.379999999999995 % 
Test score: 46.5 %


##### Cross Validation

In [51]:
from sklearn.model_selection import cross_val_score

# Cross validation with 5 folds
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")


Cross-validation scores: [0.5     0.55    0.46875 0.55625 0.54375]
Mean cross-validation score: 0.52375


##### Hyperparameter Tunning

In [52]:
from sklearn.model_selection import GridSearchCV

param_grid = {'linearsvc__C': [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# Evaluation
best_model = grid_search.best_estimator_
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

print(f"Train score: {round(train_score, 4) * 100} % \nTest score: {round(test_score, 4) * 100} %")


Best parameters: {'linearsvc__C': 1}
Best cross-validation score: 0.52375
Train score: 55.379999999999995 % 
Test score: 46.5 %


#### KNeighborsClassifier

In [53]:
model=KNeighborsClassifier()
model.fit(X_train, y_train)

##### Training / Testing score

In [54]:
print(f"Train score: {round(model.score(X_train, y_train), 4) * 100} % \nTest score: {round(model.score(X_test, y_test), 4) * 100} %")

Train score: 69.38 % 
Test score: 47.5 %


#### Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

train_score_rf = model_rf.score(X_train, y_train)
test_score_rf = model_rf.score(X_test, y_test)

print(f"Train score (Random Forest): {round(train_score_rf, 4) * 100} % \nTest score (Random Forest): {round(test_score_rf, 4) * 100} %")


Train score (Random Forest): 100.0 % 
Test score (Random Forest): 48.5 %


#### Gradient Boosting

In [56]:
from sklearn.ensemble import GradientBoostingClassifier

model_gb = GradientBoostingClassifier()
model_gb.fit(X_train, y_train)

train_score_gb = model_gb.score(X_train, y_train)
test_score_gb = model_gb.score(X_test, y_test)

print(f"Train score (Gradient Boosting): {round(train_score_gb, 4) * 100} % \nTest score (Gradient Boosting): {round(test_score_gb, 4) * 100} %")


Train score (Gradient Boosting): 87.25 % 
Test score (Gradient Boosting): 45.5 %


#### Neural Network

In [57]:
from sklearn.neural_network import MLPClassifier

model_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
model_mlp.fit(X_train, y_train)

train_score_mlp = model_mlp.score(X_train, y_train)
test_score_mlp = model_mlp.score(X_test, y_test)

print(f"Train score (Neural Network): {round(train_score_mlp, 4) * 100} % \nTest score (Neural Network): {round(test_score_mlp, 4) * 100} %")


Train score (Neural Network): 50.62 % 
Test score (Neural Network): 48.0 %


## Defining CUSTUMER TYPE as the target variable

In [58]:
y=df['Customer type']
X=df.drop('Customer type',axis=1)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
model=LinearSVC()
model.fit(X_train, y_train)

In [61]:
print(f"Train score: {round(model.score(X_train, y_train), 4) * 100} % \nTest score: {round(model.score(X_test, y_test), 4) * 100} %")

Train score: 49.5 % 
Test score: 51.5 %


## Defining PRODUCT LINE as the target variable

In [62]:
y=df['Product line']
X=df.drop('Product line',axis=1)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
model=LinearSVC()
model.fit(X_train, y_train)

In [65]:
print(f"Train score: {round(model.score(X_train, y_train), 4) * 100} % \nTest score: {round(model.score(X_test, y_test), 4) * 100} %")

Train score: 20.75 % 
Test score: 13.5 %
