## Classification
## Example: Predict survival on Titanic

In [157]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#import matplotlib.pyplot as plt
#%matplotlib inline

## Working with data

In [158]:
train = pd.read_csv(r'C:\Users\lizac\Downloads\homework-master (8)\homework-master\lecture_2\data\train.csv')
test = pd.read_csv(r'C:\Users\lizac\Downloads\homework-master (8)\homework-master\lecture_2\data\test.csv')

In [159]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [160]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [161]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### We'll need test_pas_id for submission dataframe

In [162]:
test_pas_id = test['PassengerId']

### Make a list from train and test

In [163]:
full_data=[train, test]


### Impute missing values

#### Embarked

In [164]:
train[train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [165]:
train[(train['Fare']>79) & (train['Fare']<81) & (train['Pclass']==1)].groupby('Embarked').size()

Embarked
C    4
S    3
dtype: int64

In [166]:
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('C')

#### Fare

In [167]:
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

####  Age

In [168]:
# We have plenty of missing values in this feature. 
# Generate random numbers between (mean - std) and (mean + std). 

In [169]:
np.random.seed(0)
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Data preprocessing

In [170]:
np.unique(full_data[0]['Sex'], return_counts = True)

(array(['female', 'male'], dtype=object), array([314, 577], dtype=int64))

In [171]:
  for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} )

In [172]:
for dataset in full_data:    
    #Discrete variables
    dataset['FamilySize'] = dataset ['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [173]:
np.unique(full_data[0]['Embarked'], return_counts = True)

(array(['C', 'Q', 'S'], dtype=object), array([170,  77, 644], dtype=int64))

In [174]:
for dataset in full_data:
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'C': 0, 'Q': 1, 'S': 2} ).astype(int)

In [175]:
for dataset in full_data:
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

In [176]:
for dataset in full_data:
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

In [177]:
for dataset in full_data:
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

In [178]:
#title
stat_min=10
data=pd.concat([train,test])
title_names = (data['Title'].value_counts() < stat_min)
title_names=pd.DataFrame(title_names).reset_index()
title_names=title_names[title_names['Title']==False]['index'].values
for dataset in full_data:
 
    dataset['Title'] = dataset['Title'].apply(lambda x: 'Misc' if x not in title_names else x)

In [179]:
for dataset in full_data:
    # Mapping Title
    dataset['Title'] = dataset['Title'].map( {'Master': 0, 'Misc': 1, 'Miss': 2,'Mr':3,'Mrs':4} ).astype(int)

In [180]:
np.unique(full_data[0]['AgeBin'], return_counts = True)

(array([Interval(-0.08, 16.0, closed='right'),
        Interval(16.0, 32.0, closed='right'),
        Interval(32.0, 48.0, closed='right'),
        Interval(48.0, 64.0, closed='right'),
        Interval(64.0, 80.0, closed='right')], dtype=object),
 array([117, 447, 247,  69,  11], dtype=int64))

In [181]:
#encoding interval values
label = LabelEncoder()
for dataset in full_data:
    dataset['FareBin'] = label.fit_transform(dataset['FareBin'])
    dataset['AgeBin'] = label.fit_transform(dataset['AgeBin'])

In [182]:
np.unique(full_data[0]['Title'], return_counts = True)

(array([0, 1, 2, 3, 4]), array([ 40,  27, 182, 517, 125], dtype=int64))

In [183]:
factors_train = train[['Pclass','Embarked','Title','FareBin','AgeBin']]
factors_test = test[['Pclass','Embarked','Title','FareBin','AgeBin']]

In [184]:

enc = preprocessing.OneHotEncoder(dtype = 'int32')
enc.fit(factors_train)

OneHotEncoder(categorical_features='all', dtype='int32',
       handle_unknown='error', n_values='auto', sparse=True)

In [185]:
tfactors_train = enc.transform(factors_train).toarray()
tfactors_train

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int32)

In [186]:
tfactors_test = enc.transform(factors_test).toarray()
tfactors_test[:5,]

array([[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0]],
      dtype=int32)

In [187]:
train_pclass_emb = pd.DataFrame(tfactors_train, columns = ('Pclass_1', 'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S','Master', 'Misc', 'Miss','Mr','Mrs',
                                                          'Fare_1','Fare_2','Fare_2','Fare_4','Age_1','Age_2','Age_3','Age_4','Age_5'))

In [188]:
train_pclass_emb.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Emb_C,Emb_Q,Emb_S,Master,Misc,Miss,Mr,Mrs,Fare_1,Fare_2,Fare_2.1,Fare_4,Age_1,Age_2,Age_3,Age_4,Age_5
0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0
1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
2,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0


In [189]:
test_pclass_emb = pd.DataFrame(tfactors_test, columns =('Pclass_1', 'Pclass_2', 'Pclass_3', 'Emb_C', 'Emb_Q', 'Emb_S','Master', 'Misc', 'Miss','Mr','Mrs',
                                                          'Fare_1','Fare_2','Fare_2','Fare_4','Age_1','Age_2','Age_3','Age_4','Age_5'))

In [190]:
test_pclass_emb.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3,Emb_C,Emb_Q,Emb_S,Master,Misc,Miss,Mr,Mrs,Fare_1,Fare_2,Fare_2.1,Fare_4,Age_1,Age_2,Age_3,Age_4,Age_5
0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0
1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0
2,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
3,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0
4,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0


### Feature Selection

In [191]:
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Pclass', 'Embarked','Age','Fare','Title']
train = train.drop(drop_elements, axis = 1)
test = test.drop(drop_elements, axis = 1)

In [192]:
train = pd.concat([train,train_pclass_emb], axis=1)
test = pd.concat([test,test_pclass_emb], axis=1)

In [193]:
train.head(5)

Unnamed: 0,Survived,Sex,SibSp,Parch,FamilySize,IsAlone,FareBin,AgeBin,Pclass_1,Pclass_2,...,Mrs,Fare_1,Fare_2,Fare_2.1,Fare_4,Age_1,Age_2,Age_3,Age_4,Age_5
0,0,1,1,0,2,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
1,1,0,1,0,2,0,3,2,1,0,...,1,0,0,0,1,0,0,1,0,0
2,1,0,0,0,1,1,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
3,1,0,1,0,2,0,3,2,1,0,...,1,0,0,0,1,0,0,1,0,0
4,0,1,0,0,1,1,1,2,0,0,...,0,0,1,0,0,0,0,1,0,0


In [194]:
trainv = train.values

In [195]:
trainv.shape

(891, 28)

In [196]:
X = trainv[0:, 1:]
y = trainv[0:, 0]

In [197]:
# Standardize features by removing the mean and scaling to unit variance
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)



### Data prepared to predict for submission

In [198]:
Xnew = test.values
Xnew.shape

(418, 27)

## Modeling

In [199]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

In [200]:
# Split to train and test
# 75% and 25% by default
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=2)
print(Xtrain.shape, Xtest.shape)

(668, 27) (223, 27)


In [201]:
np.unique(ytrain, return_counts = True)

(array([0, 1], dtype=int64), array([418, 250], dtype=int64))

In [202]:
np.unique(ytest, return_counts = True)

(array([0, 1], dtype=int64), array([131,  92], dtype=int64))

In [203]:
# http://scikit-learn.org

## LogisticRegression

In [204]:
# http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [205]:
# Model
model_lgr = LogisticRegression(random_state = 1)
print(model_lgr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [206]:
# C - Inverse of regularization strength; must be a positive float.
# Smaller values specify stronger regularization.

### Fit the model

In [207]:
model_lgr.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Model fit parameters

In [208]:
model_lgr.coef_

array([[-1.17484332, -0.57707248, -0.09031429, -0.43952872, -0.18343808,
         0.13217863, -0.21320633,  0.4122729 ,  0.08751297, -0.42643923,
         0.08780048,  0.03459227, -0.09878644,  0.65446293, -0.03344339,
        -0.05116449, -0.32604506,  0.14901155, -0.11975548, -0.01340086,
         0.04517379,  0.08820232,  0.0874286 ,  0.04764232, -0.00231699,
        -0.02687893, -0.40868046]])

In [209]:
params = pd.Series(model_lgr.coef_.reshape(11,), index=train.columns[1:])
params

ValueError: cannot reshape array of size 27 into shape (11,)

In [None]:
model_lgr.intercept_

### Model validation

In [None]:
# Predict on train

ypred_train = model_lgr.predict(Xtrain)
ypred_train_proba = model_lgr.predict_proba(Xtrain)

In [None]:
# Predict on test

ypred = model_lgr.predict(Xtest)
print(ypred[:10])

ypred_proba = model_lgr.predict_proba(Xtest)
print(ypred_proba[:5,:])

# ypred_proba[:,0] - probability for class zero (not survived), 
# ypred_proba[:,1] - probability for class one - survived

#### Metrics: accuracy, confusion matrix, classification report, AUC
#### http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics

In [None]:
# We can check our classification accuracy by comparing 
# the true values of the test set to the predictions:

In [None]:
# Accuracy on train
accuracy_score(ytrain, ypred_train)

In [None]:
# Accuracy on test
accuracy_score(ytest, ypred)

In [None]:
# Score for classification models is accuracy
model_lgr.score(Xtest, ytest)

In [None]:
# Accuracy doesn't tell us where we've gone wrong: 
# one nice way to do this is to use the confusion matrix

In [None]:
print(confusion_matrix(ytest, ypred))

In [None]:
target_names = ['not survived', 'survived']
print(classification_report(ytest, ypred, target_names=target_names))

In [None]:
# AUC
# y_scores -  probability estimates of the positive class

print("AUC on traint =", roc_auc_score(ytrain, ypred_train_proba[:, 1]))
print("AUC on test =", roc_auc_score(ytest, ypred_proba[:, 1]))

#### <span style="color:red">Submission to kaggle a prediction for Xnew with model_lgr was given a score (accuracy) 0.7799</span>

### K-fold Cross-Validation

In [None]:
# http://scikit-learn.org/stable/modules/cross_validation.html

from sklearn.model_selection import cross_val_score

In [None]:
lgr = LogisticRegression(random_state = 1)

# Split to train and test: 80% and 20% 
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=2, test_size=0.2)

# Train, valid, test
scores = cross_val_score(lgr, Xtrain, ytrain, cv=5)
scores

In [None]:
print("Mean cv accuracy : %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
lgr.fit(Xtrain,ytrain)
print("Accuracy on train",lgr.score(Xtrain,ytrain))
print("Accuracy on test", lgr.score(Xtest, ytest))

### Hyperparameters Grid Search

In [None]:
# http://scikit-learn.org/stable/modules/grid_search.html#grid-search

# GridSearchCV exhaustively considers all parameter combinations

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [.001, .01, 1, 10],
              'penalty': ['l1', 'l2']}
lgr = LogisticRegression(random_state=1)
grid = GridSearchCV(lgr, param_grid, cv=5)

In [None]:
grid.fit(Xtrain, ytrain)

In [None]:
# Mean cross-validated score of the best_estimator
grid.best_score_

In [None]:
grid.best_params_

In [None]:
model = grid.best_estimator_

In [None]:
print(model)

In [None]:
model.score(Xtest,ytest)

In [None]:
model.score(Xtrain,ytrain)

### Save / load a model

In [None]:
from sklearn.externals import joblib
joblib.dump(model, 'model.pkl') 

In [None]:
model1 = joblib.load('model.pkl') 

In [None]:
print(model1)

In [None]:
ypred = model1.predict(Xtest)
ypred[:10]

### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=2)
print(Xtrain.shape, Xtest.shape)

In [None]:
rfc = RandomForestClassifier(random_state = 2)
rfc.fit(Xtrain, ytrain)

In [None]:
rfc.score(Xtrain, ytrain)

In [None]:
rfc.score(Xtest, ytest)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(Xtrain, ytrain)
Y_pred = decision_tree.predict(Xtest)
acc_decision_tree = round(decision_tree.score(Xtrain, ytrain) * 100, 2)
acc_decision_tree

In [None]:
decision_tree.score(Xtest, ytest)

In [None]:
featires_imp = pd.Series(rfc.feature_importances_, index=train.columns[1:])
featires_imp

In [None]:
ypred_test = rfc.predict(Xtest)

In [None]:
target_names = ['not survived', 'survived']
print(classification_report(ytest, ypred_test, target_names=target_names))

#### Hyperparameters Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [5, 10, 15, 20,200],
             'max_depth': [2, 3, 4, 5, 6, 7, 8]}
grid = GridSearchCV(RandomForestClassifier(random_state = 2), param_grid, cv=3)

In [None]:
grid.fit(Xtrain, ytrain)

In [None]:
# Mean cross-validated score of the best_estimator
grid.best_score_

In [None]:
grid.best_params_

In [None]:
best_rfc = grid.best_estimator_

In [None]:
best_rfc.score(Xtest,ytest)

In [None]:
best_rfc.score(Xtrain,ytrain)

### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier(random_state = 2)

In [None]:
gbc.fit(Xtrain,ytrain)

In [None]:
gbc.score(Xtrain,ytrain)

In [None]:
gbc.score(Xtest,ytest)

In [None]:
# learning_rate, n_estimators, max_depth

#### Hyperparameters Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [10, 20, 30, 40, 50, 75, 100],
             'max_depth': [2, 3, 4, 5, 6]}
grid = GridSearchCV(GradientBoostingClassifier(random_state = 2), param_grid, cv=4)

In [None]:
grid.fit(Xtrain, ytrain)

In [None]:
# Mean cross-validated score of the best_estimator
grid.best_score_

In [None]:
grid.best_params_

In [None]:
best_gbc = grid.best_estimator_

In [None]:
best_gbc.score(Xtest,ytest)

In [None]:
best_gbc.score(Xtrain,ytrain)

### XGBoost
#### http://xgboost.readthedocs.io/en/latest/python/python_intro.html

In [None]:
grid_n_estimator = [10, 50, 100, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2, 4, 6, 8, 10, None]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Create the parameter grid: gbm_param_grid 
gbm_param_grid = {
'learning_rate': grid_learn, #default: .3
            'max_depth': [1,2,4,6,8,10], #default 2
            'n_estimators': grid_n_estimator, 
            'seed': grid_seed  
}

# Instantiate the regressor: gbm
gbm = XGBClassifier(n_estimators=10)

# Perform random search: grid_mse
xgb_random = RandomizedSearchCV(param_distributions=gbm_param_grid, 
                                    estimator = gbm, scoring = "accuracy", 
                                    verbose = 4, n_iter = 100, cv = 4)


# Fit randomized_mse to the data
xgb_random.fit(Xtrain, ytrain)

In [None]:
xgb_random.score(Xtest,ytest)

### Submission

In [None]:
ypred_Xnew = model_lgr.predict(Xnew).astype(int)

In [None]:
# Generate Submission File 

# Use model with the best accuracy on test to predict on Xnew (ypred_Xnew should be int)

# Example: ypred_Xnew = model_lgr.predict(Xnew).astype(int)

submission = pd.DataFrame({ 'PassengerId': test_pas_id,
                            'Survived': ypred_Xnew })
submission.to_csv("submission.csv", index=False)

1) Register on https://www.kaggle.com
2) Go to https://www.kaggle.com/c/titanic/submit
3) Submit your csv file and get the score (accuracy)