## Interfacing Between Pandas and Model Code

In [1]:
import numpy as np

import pandas as pd

import os as o

import sys as ss

import csv as cv

import json as jayz

# import lxml as xml   (not working even though it is installed)


import requests

import sqlite3

# import sqlalchemy as sqa , not working even though installed


import re

import matplotlib.pyplot as plt

from datetime import datetime  as datetm 

from datetime import timedelta as tdel

import seaborn as sns

# When plotting from PyCharm (or any plain Python script), remember that whichever plotting method you use,
# the figure will not be displayed until you call plt.show()


from io import StringIO

import pytz




In [2]:
# Now, usually before building a machine learning model it is better to perform the data transformation and cleaning
# using pandas. This is called feature engineering.

In [3]:
# Now, the conversion between pandas and other analysis libraries is usually done by using Numpy as the point of contact

In [4]:
# Build a small mixed-dtype frame: floats (x1), strings (x2) and integers (x3).
data = pd.DataFrame(
    {
        'x1': np.arange(5.),
        'x2': ['aaha', 'ohoo', 'oyeee', 'naaah', 'waaah'],
        'x3': np.arange(-10, 0, 2),
    }
)

data

Unnamed: 0,x1,x2,x3
0,0.0,aaha,-10
1,1.0,ohoo,-8
2,2.0,oyeee,-6
3,3.0,naaah,-4
4,4.0,waaah,-2


In [5]:
# Column labels of the frame, returned as a pandas Index.
data.columns

Index(['x1', 'x2', 'x3'], dtype='object')

In [6]:
# Convert the frame to a 2-D NumPy array. The columns have different dtypes
# (float, string, int), so the array falls back to dtype=object.
data.to_numpy()

array([[0.0, 'aaha', -10],
       [1.0, 'ohoo', -8],
       [2.0, 'oyeee', -6],
       [3.0, 'naaah', -4],
       [4.0, 'waaah', -2]], dtype=object)

In [7]:
# To convert back to a DataFrame, you can pass the same two-dimensional array along with optional column names

In [8]:
# Round-trip: wrap the 2-D array back into a frame, supplying new column labels.
data1 = pd.DataFrame(
    data.to_numpy(),
    columns=['ek', 'do', 'teen'],
)

data1

Unnamed: 0,ek,do,teen
0,0.0,aaha,-10
1,1.0,ohoo,-8
2,2.0,oyeee,-6
3,3.0,naaah,-4
4,4.0,waaah,-2


In [9]:
# The to_numpy method is best suited to a DataFrame whose data is homogeneous (e.g. all numeric).
# For heterogeneous data the result is an ndarray of Python objects (dtype=object).

In [10]:
# Work on a copy so that later changes to `dats` do not touch `data`.
dats = data.copy()

dats

Unnamed: 0,x1,x2,x3
0,0.0,aaha,-10
1,1.0,ohoo,-8
2,2.0,oyeee,-6
3,3.0,naaah,-4
4,4.0,waaah,-2


In [11]:
# Add a second string column, so the frame has two non-numeric columns.
dats['stringers'] = ['LP','GD','IM','BS','CE']

dats

Unnamed: 0,x1,x2,x3,stringers
0,0.0,aaha,-10,LP
1,1.0,ohoo,-8,GD
2,2.0,oyeee,-6,IM
3,3.0,naaah,-4,BS
4,4.0,waaah,-2,CE


In [12]:
# With heterogeneous columns, to_numpy() yields an array of Python objects
# (dtype=object) rather than a numeric array.
dats.to_numpy()

array([[0.0, 'aaha', -10, 'LP'],
       [1.0, 'ohoo', -8, 'GD'],
       [2.0, 'oyeee', -6, 'IM'],
       [3.0, 'naaah', -4, 'BS'],
       [4.0, 'waaah', -2, 'CE']], dtype=object)

In [13]:
# Sometimes for training a model you may only need few rows or columns, for doing that it's best to use loc method

In [14]:
# Select all rows (:) and only the columns x1 and x3, by label.
data.loc[:,['x1','x3']]

Unnamed: 0,x1,x3
0,0.0,-10
1,1.0,-8
2,2.0,-6
3,3.0,-4
4,4.0,-2


In [15]:
# You have already studied the Categorical type in chapter 7. It represents a column whose values come from a fixed set of categories

In [16]:
# Attach a categorical column restricted to the two categories 'a' and 'b'.
data['catme'] = pd.Categorical(
    values=['a', 'a', 'b', 'b', 'a'],
    categories=['a', 'b'],
)

data

Unnamed: 0,x1,x2,x3,catme
0,0.0,aaha,-10,a
1,1.0,ohoo,-8,a
2,2.0,oyeee,-6,b
3,3.0,naaah,-4,b
4,4.0,waaah,-2,a


In [17]:
# Now, using dummy variables, we can replace the categorical column with one indicator column per category

In [18]:
# One-hot encode the categorical column: one indicator column per category,
# named '<prefix>_<category>'.
# dtype=int is passed explicitly because pandas >= 2.0 returns boolean
# indicators by default; requesting integers keeps the 0/1 result
# reproducible across pandas versions.
dummyme = pd.get_dummies(data.catme, prefix='catmegory', dtype=int)

dummyme

Unnamed: 0,catmegory_a,catmegory_b
0,1,0
1,1,0
2,0,1
3,0,1
4,1,0


In [19]:
# Replace the categorical column with its indicator columns.
# The drop list also names the dummy columns and uses errors='ignore' so the
# cell is idempotent: re-running it after 'catme' has already been replaced
# raises neither a KeyError in drop() nor a column-overlap error in join().
data = data.drop(
    columns=['catme', *dummyme.columns],
    errors='ignore',
).join(dummyme)

data

Unnamed: 0,x1,x2,x3,catmegory_a,catmegory_b
0,0.0,aaha,-10,1,0
1,1.0,ohoo,-8,1,0
2,2.0,oyeee,-6,0,1
3,3.0,naaah,-4,0,1
4,4.0,waaah,-2,1,0


## Creating Model Descriptions with Patsy

In [20]:
# Patsy is a Python library for describing statistical models using a special string-based formula syntax.

In [21]:
# Patsy's formulas are special string syntax that look like
#               y ~ x0 + x1

In [22]:
# The syntax does not mean "add x0 and x1". Instead, x0 and x1 are terms in the design matrix created for the model

In [23]:
# Toy frame with three numeric columns: xcx, xdx and main.
dare = pd.DataFrame(
    {
        'xcx': [1, 2, 3, 4, 5],
        'xdx': [0.01, 0.22, -12, 0., -1.23],
        'main': [-11, 9, 2.44, 1.2, -2.],
    }
)

dare

Unnamed: 0,xcx,xdx,main
0,1,0.01,-11.0
1,2,0.22,9.0
2,3,-12.0,2.44
3,4,0.0,1.2
4,5,-1.23,-2.0


In [24]:
# import patsy as patty is not working. refer patsylearn.py present in Python_learn

## Introduction to Statsmodels

In [25]:
# statsmodels is better suited to classical statistical methods.
# Some kinds of models found in statsmodels include
# 1) Linear models, generalized linear models and robust linear models
# 2) Linear fixed effects models
# 3) Analysis of Variance(ANOVA)
# 4) Time series processes and state space models
# 5) Generalized method of moments
# `import statsmodel` is not working: note that the package is named `statsmodels`
# (e.g. `import statsmodels.api as sm`). Refer intro_statsmodel.py


## Introduction to scikit-learn

In [26]:
# Load the Titanic training split; the path is relative to the current
# working directory.
trainer = pd.read_csv('datasets/titanic/train.csv')

In [27]:
# Display the training frame (long frames are shown truncated).
trainer

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [28]:
# Load the Titanic test split; unlike the training split it has no
# 'Survived' column.
tester = pd.read_csv('datasets/titanic/test.csv')

tester

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [29]:
# Libraries such as scikit-learn and statsmodels generally cannot be fed
# missing data, so first count the missing values in each column.

trainer.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [30]:
# Same per-column count of missing values, for the test split.
tester.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [31]:
# As we know, the typical procedure is to fit the model on the training data and then
# predict, from the features of each individual in the testing data, whether that individual survived or not.

In [32]:
# Here we want to use the Age column as a feature, but it has missing values. Hence we will fill them with the median age of the training data

In [33]:
# Impute missing ages with the median of the *training* ages. The same
# training-set value is reused for the test split rather than computing a
# separate median from the test data.
filler = trainer['Age'].median()

trainer['Age']=trainer['Age'].fillna(filler)

tester['Age'] = tester['Age'].fillna(filler)

In [34]:
# Sanity check: no missing ages should remain in the training split.
trainer['Age'].isna().sum()

0

In [35]:
# Now we need to specify our model, so let's start by adding a column named IsFemale as an encoded version of the Sex column

In [36]:
# Encode sex as a 0/1 indicator (1 = female) in both splits.
trainer['IsFemale'] = trainer['Sex'].eq('female').astype(int)

tester['IsFemale'] = tester['Sex'].eq('female').astype(int)

In [37]:

# Display the training frame again to confirm the new IsFemale column.
trainer

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsFemale
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0


In [38]:
# Now we decide some model variables and parameters

In [39]:
# Names of the feature columns used to build the model inputs.
predictor = ['Pclass','IsFemale','Age']

In [40]:
# Independent copy of the prepared training frame, used for inspection.
trainer_copyy = trainer.copy()

trainer_copyy

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsFemale
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0


In [41]:
# Select only the predictor columns (indexing with a list of labels
# returns a DataFrame).
trainer_copyy[predictor]

Unnamed: 0,Pclass,IsFemale,Age
0,3,0,22.0
1,1,1,38.0
2,3,1,26.0
3,1,1,35.0
4,3,0,35.0
...,...,...,...
886,2,0,27.0
887,1,1,19.0
888,3,1,28.0
889,1,0,26.0


In [42]:
# Same selection as a NumPy array; the integer and float columns are
# combined into a single float array.
trainer_copyy[predictor].to_numpy()

array([[ 3.,  0., 22.],
       [ 1.,  1., 38.],
       [ 3.,  1., 26.],
       ...,
       [ 3.,  1., 28.],
       [ 1.,  0., 26.],
       [ 3.,  0., 32.]])

In [43]:
# Display the copy again after the selections above.
trainer_copyy

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsFemale
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0


In [44]:
# Feature arrays for both splits, restricted to the predictor columns.
X_trainer = trainer.loc[:, predictor].to_numpy()
X_tester = tester.loc[:, predictor].to_numpy()

# Target vector: the survival labels of the training split.
y_traincomparor = trainer.loc[:, 'Survived'].to_numpy()

In [45]:
# Show the test features, the training features and the training labels in turn.
print(X_tester, X_trainer, y_traincomparor, sep='\n')

[[ 3.   0.  34.5]
 [ 3.   1.  47. ]
 [ 2.   0.  62. ]
 ...
 [ 3.   0.  38.5]
 [ 3.   0.  28. ]
 [ 3.   0.  28. ]]
[[ 3.  0. 22.]
 [ 1.  1. 38.]
 [ 3.  1. 26.]
 ...
 [ 3.  1. 28.]
 [ 1.  0. 26.]
 [ 3.  0. 32.]]
[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0
 1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0
 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0
 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1

In [46]:
# Now we will use LogisticRegression from sklearn to do the prediction

In [47]:
from sklearn.linear_model import LogisticRegression as LR

In [48]:

# Logistic-regression classifier with default settings.
models = LR()

In [49]:
# Now we will pass the training data to the model

In [50]:
# Fit the classifier on the training features and survival labels.
models.fit(X_trainer,y_traincomparor)

LogisticRegression()

In [51]:
# Now lets do the prediction

In [52]:
# Predicted survival label (0 or 1) for each row of the test features.
y_beacon_of_truth = models.predict(X_tester)

y_beacon_of_truth

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [53]:
# Training labels, shown for reference. They belong to the training
# passengers, not to the test rows that were predicted.
y_traincomparor

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [54]:
# Now if you had true values of the testing dataset you could compute an accuracy percentage

In [55]:
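# (y_beacon_of_truth == y_true).mean()   # y_true: the true labels of the test set, which test.csv does not include
# Do not compare against y_traincomparor: those are the training labels, which belong to different passengers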

In [56]:
# There are other models as well that have built-in cross-validation and can give better results


In [57]:
from sklearn.linear_model import LogisticRegressionCV as LRcv

In [58]:
# Logistic regression with built-in cross-validation, default settings.
modelme = LRcv()

In [59]:
# Fit the cross-validated model on the same training features and labels.
modelme.fit(X_trainer,y_traincomparor)

LogisticRegressionCV()

In [60]:
# Now we will do the cross-validation

In [61]:
from sklearn.model_selection import cross_val_score as cvs

In [62]:
# Plain logistic regression with C=10 (in scikit-learn, C is the inverse of
# the regularization strength).
modelme1 = LR(C=10)

In [63]:
# Score the estimator with 4-fold cross-validation; one score per fold.
scorer = cvs(estimator=modelme1, X=X_trainer, y=y_traincomparor, cv=4)

scorer

array([0.77578475, 0.79820628, 0.77578475, 0.78828829])

## 