# Logistic regression model

## Table of Contents


<div>

1. [Data preparation and analysis](#0)<br>
2. [Model Evaluation](#1)<br>
3. [Predicting a gender for a random session_id](#2)<br>
</div>
<hr>


## Data preparation and analysis <a id="0"></a>

In [1]:
import pandas as pd
import numpy as np

In [4]:
pr_df = pd.read_csv('/Users/Vera/Downloads/product.csv')
pr_df.head()

Unnamed: 0,session_id,sequence_order,category_a,category_b,category_c,category_d
0,u10001,1,A00001,B00001,C00001,D00001
1,u10002,1,A00002,B00002,C00002,D24897
2,u10003,1,A00002,B00002,C00002,D00002
3,u10003,2,A00002,B00002,C00003,D00003
4,u10003,3,A00002,B00002,C00007,D00007


In [5]:
sn_df = pd.read_csv('/Users/Vera/Downloads/session.csv')
sn_df.head()

Unnamed: 0,session_id,start_time,end_time,gender
0,u10001,2014-11-14 00:02:14,2014-11-14 00:02:20,female
1,u10002,2014-12-12 14:12:05,2014-12-12 14:12:06,female
2,u10003,2014-11-14 00:02:41,2014-11-14 00:16:40,female
3,u10004,2014-11-14 00:21:55,2014-11-14 00:21:57,female
4,u10005,2014-11-14 00:26:08,2014-11-14 00:29:25,female


In [6]:
pr_df[['session_id','category_a','category_b','category_c','category_d']].describe()

Unnamed: 0,session_id,category_a,category_b,category_c,category_d
count,66491,66491,66491,66491,66491
unique,30000,11,91,440,36092
top,u12532,A00002,B00002,C00007,D00266
freq,43,42801,15874,6064,161


In [5]:
sn_df.describe()

Unnamed: 0,session_id,start_time,end_time,gender
count,30000,30000,30000,15000
unique,30000,29754,29727,2
top,u21644,2014-12-09 14:51:16,2014-11-21 15:33:44,female
freq,1,3,3,11703


In [7]:
df = pr_df.merge(sn_df,left_on='session_id',right_on='session_id',how='left')
df = df.dropna()
columns = ['category_a','category_b','category_c','category_d','gender']
data = df[columns]
data.describe()

Unnamed: 0,category_a,category_b,category_c,category_d,gender
count,33455,33455,33455,33455,33455
unique,11,86,383,21880,2
top,A00002,B00002,C00007,D00266,female
freq,21370,8076,3028,89,26384


In [8]:
df.head()

Unnamed: 0,session_id,sequence_order,category_a,category_b,category_c,category_d,start_time,end_time,gender
0,u10001,1,A00001,B00001,C00001,D00001,2014-11-14 00:02:14,2014-11-14 00:02:20,female
1,u10002,1,A00002,B00002,C00002,D24897,2014-12-12 14:12:05,2014-12-12 14:12:06,female
2,u10003,1,A00002,B00002,C00002,D00002,2014-11-14 00:02:41,2014-11-14 00:16:40,female
3,u10003,2,A00002,B00002,C00003,D00003,2014-11-14 00:02:41,2014-11-14 00:16:40,female
4,u10003,3,A00002,B00002,C00007,D00007,2014-11-14 00:02:41,2014-11-14 00:16:40,female


### Преобразуем данные из DataFrame data в численный формат
1. female -- 0
2. male -- 1

In [7]:
for col in range(0,data.shape[1]) :
    for line in range(0,data.shape[0]) :
        if col == 4 :
            if data.values[line][col] == 'female' : data.values[line][col] = 0
            elif data.values[line][col] == 'male' : data.values[line][col] = 1
            continue
        data.values[line][col] = int(data.values[line][col].lstrip('ABCD'))
        
data.head(5)

Unnamed: 0,category_a,category_b,category_c,category_d,gender
0,1,1,1,1,0
1,2,2,2,24897,0
2,2,2,2,2,0
3,2,2,3,3,0
4,2,2,7,7,0


In [8]:
data.gender.value_counts()

0    26384
1     7071
Name: gender, dtype: int64

### Deviding data into train, test datasets

In [9]:
from sklearn.model_selection import train_test_split
X = np.array(data[columns[0:4]])
Y = np.array(data.gender)
Y=Y.astype('int')
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.20)

## Model Evaluation  <a id="1"></a>

### Импортируем библиотеки ML

In [10]:
# import scikit-learn libraries
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [11]:
print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)

(26764, 4) (26764,) (6691, 4) (6691,)


In [12]:
reg = LogisticRegression()
reg.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
Y_pred = reg.predict(X_test)

In [14]:
print(confusion_matrix(Y_test, Y_pred))

[[5152  149]
 [1081  309]]


In [15]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89      5301
           1       0.67      0.22      0.33      1390

    accuracy                           0.82      6691
   macro avg       0.75      0.60      0.61      6691
weighted avg       0.80      0.82      0.78      6691



# Predicting a gender for a random session_id <a id="2"></a>


## Preparing data for prediction

In [17]:
id = 'u25001'

In [26]:
def data_for_gender_prediction(id):
    ex=pr_df[pr_df.session_id==id]
    ex1=ex.iloc[:,2:6]
    ex_size=ex1.shape
    for i in range(0,ex_size[0]):
        for j in range(0,ex_size[1]):
            ex1.values[i][j]= ex1.values[i][j].lstrip('ABCD')
    return ex1

In [27]:
ex_data=data_for_gender_prediction(id)
ex_data

Unnamed: 0,category_a,category_b,category_c,category_d
33455,2,3,46,1169
33456,2,2,3,1457
33457,2,3,14,1478


## Prediction of gender

In [28]:
def predict_function(ex_data): 
    pred_ex=reg.predict(ex_data)
    print(pred_ex)
    count_1=0
    count_0=0
    for b in pred_ex:
        if b==0:
            count_0=count_0+1
        else:
            count_1=count_1+1

    if count_0>count_1:
            gender_ex='male'
    else:
            gender_ex='female'
    return gender_ex

In [29]:
g=predict_function(ex_data)
print(g)

[0 0 0]
male


# Saving model

In [37]:
from sklearn.externals import joblib

filename = '/Users/Vera/Downloads/final_logreg_model.sav'
joblib.dump(reg, filename)

['/Users/Vera/Downloads/final_logreg_model.sav']

In [38]:
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, Y_test)
print(result)

0.816170975937827
