In [129]:
import numpy as np
import pandas as pd
from sklearn import model_selection, linear_model, metrics, ensemble
import xgboost as xgb

In [40]:
tweets = pd.DataFrame.from_csv("Tweets data.csv", encoding = "ISO-8859-1")

In [45]:
def ht(a):
    n = 25
    res = np.zeros(n)
    for ind in range(0, len(a) - 2):
        res[hash(a[ind:ind+2]) % n] += 1
    return pd.Series(res)

In [49]:
feat = tweets.SentimentText.apply(ht)

In [57]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(feat, tweets.Sentiment, random_state=42)

In [58]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((74991, 25), (24998, 25), (74991,), (24998,))

In [60]:
clf = linear_model.LogisticRegression()

In [61]:
clf.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
metrics.roc_auc_score(y_train, clf.predict(x_train))

0.5479203840312984

In [66]:
metrics.roc_auc_score(y_test, clf.predict(x_test))

0.5473353855015297

In [71]:
clf.predict_proba(ht('hello').reshape(1, -1))

  """Entry point for launching an IPython kernel.


array([[0.38989317, 0.61010683]])

In [76]:
clf = linear_model.LogisticRegression()

In [77]:
clf.fit(np.array([[0, 0], [1, 1]]), np.array([0, 1]))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [80]:
clf.predict(np.array([[0, 0]]))

array([0])

In [81]:
clf.predict(np.array([[1, 1]]))

array([1])

In [83]:
pool = xgb.DMatrix(x_train, label=y_train)

In [86]:
pool_test = xgb.DMatrix(x_test, label=y_test)

In [127]:
param = {'eta': 0.1,
         'objective': 'binary:logistic',
         'eval_metric': ['auc', 'logloss'], 
         'subsample': 0.8,
         'max_depth': 3
        }

In [128]:
clf = xgb.train(param, 
                pool, 
                num_boost_round=1000,
                early_stopping_rounds=10,
                evals=[(pool, 'train'), 
                       (pool_test, 'test')]
               )

[0]	train-auc:0.557462	train-logloss:0.690511	test-auc:0.550335	test-logloss:0.690761
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 10 rounds.
[1]	train-auc:0.564288	train-logloss:0.688303	test-auc:0.557095	test-logloss:0.688766
[2]	train-auc:0.56848	train-logloss:0.686391	test-auc:0.560253	test-logloss:0.687095
[3]	train-auc:0.57461	train-logloss:0.684754	test-auc:0.564225	test-logloss:0.685667
[4]	train-auc:0.577194	train-logloss:0.683399	test-auc:0.568267	test-logloss:0.684461
[5]	train-auc:0.577722	train-logloss:0.682197	test-auc:0.569454	test-logloss:0.683354
[6]	train-auc:0.582494	train-logloss:0.681096	test-auc:0.57274	test-logloss:0.682456
[7]	train-auc:0.582857	train-logloss:0.680206	test-auc:0.573505	test-logloss:0.681667
[8]	train-auc:0.584344	train-logloss:0.679409	test-auc:0.575099	test-logloss:0.680969
[9]	train-auc:0.58653	train-logloss:0.678638	test-auc:0.577251	test-logloss:0.68

In [103]:
clf

<xgboost.core.Booster at 0x7fd980f53da0>

In [107]:
metrics.roc_auc_score(pool.get_label(), clf.predict(pool))

0.6834326947038624

In [108]:
metrics.roc_auc_score(pool_test.get_label(), clf.predict(pool_test))

0.6114461274781781

In [187]:
clf = ensemble.RandomForestClassifier(max_depth=4,
                                      n_estimators=100,
                                      n_jobs=4,
                                     criterion='gini')

In [188]:
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [189]:
metrics.roc_auc_score(y_train, clf.predict_proba(x_train)[:, 1])

0.5969794774005208

In [191]:
!head gender_submission.csv

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1


In [190]:
metrics.roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])

0.5864368224248238

In [192]:
!head train.csv

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S


In [193]:
data = pd.read_csv('train.csv')

In [195]:
data.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [205]:
x_dat = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']]

In [206]:
x_dat.head(n=2)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
0,3,22.0,1,0,7.25,0
1,1,38.0,1,0,71.2833,1


In [207]:
y_dat = data.Survived

In [208]:
x_dat_pure = x_dat.dropna()

In [209]:
y_dat.shape

(891,)

In [217]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_dat.drop('Survived', axis = 1), x_dat.Survived)

In [218]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((668, 5), (668,), (223, 5), (223,))

In [222]:
pool = xgb.DMatrix(x_train, label=y_train)
pool_test = xgb.DMatrix(x_test, label=y_test)

In [231]:
param = {'eta': 0.01,
         'objective': 'binary:logistic',
         'eval_metric': ['auc', 'logloss'], 
         'subsample': 0.8,
         'max_depth': 3
        }

In [232]:
clf = xgb.train(param, 
                pool, 
                num_boost_round=1000,
                early_stopping_rounds=10,
                evals=[(pool, 'train'), 
                       (pool_test, 'test')]
               )

[0]	train-auc:0.737332	train-logloss:0.690812	test-auc:0.705507	test-logloss:0.690892
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 10 rounds.
[1]	train-auc:0.756247	train-logloss:0.68854	test-auc:0.718619	test-logloss:0.688852
[2]	train-auc:0.762146	train-logloss:0.686473	test-auc:0.710315	test-logloss:0.686854
[3]	train-auc:0.763457	train-logloss:0.684319	test-auc:0.730638	test-logloss:0.684771
[4]	train-auc:0.762626	train-logloss:0.682191	test-auc:0.729939	test-logloss:0.682882
[5]	train-auc:0.765356	train-logloss:0.680102	test-auc:0.728977	test-logloss:0.680814
[6]	train-auc:0.771542	train-logloss:0.678126	test-auc:0.719668	test-logloss:0.679116
[7]	train-auc:0.778729	train-logloss:0.67626	test-auc:0.738199	test-logloss:0.677267
[8]	train-auc:0.7821	train-logloss:0.674166	test-auc:0.743706	test-logloss:0.675268
[9]	train-auc:0.783171	train-logloss:0.672032	test-auc:0.733261	test-logloss:0.67

In [252]:
metrics.accuracy_score(pool.get_label(),  clf.predict(pool)>0.5)

0.7425149700598802

In [275]:
metrics.accuracy_score(pool_test.get_label(),  clf.predict(pool_test)>0.5)

0.7443946188340808

In [253]:
data_test = pd.read_csv('test.csv')

In [256]:
submit_data = data_test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [258]:
pool_submit = xgb.DMatrix(submit_data)

In [263]:
submit_result = (clf.predict(pool_submit) > 0.5) * 1

In [265]:
my_submission = pd.DataFrame({'PassengerId': data_test.PassengerId, 
                              'Survived': submit_result})

In [266]:
my_submission.head(n = 3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0


In [273]:
with open('submit.csv', 'w') as fd:
    fd.write(my_submission.to_csv(index=False))

In [274]:
!head submit.csv

PassengerId,Survived
892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,1
900,0


In [270]:
!head gender_submission.csv

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1


In [271]:
help(my_submission.to_csv)

Help on method to_csv in module pandas.core.frame:

to_csv(path_or_buf=None, sep=',', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.') method of pandas.core.frame.DataFrame instance
    Write DataFrame to a comma-separated values (csv) file
    
    Parameters
    ----------
    path_or_buf : string or file handle, default None
        File path or object, if None is provided the result is returned as
        a string.
    sep : character, default ','
        Field delimiter for the output file.
    na_rep : string, default ''
        Missing data representation
    float_format : string, default None
        Format string for floating point numbers
    columns : sequence, optional
        Columns to write
    header : boolean or list of string, default T