In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from tabulate import tabulate

from sklearn.cluster import MeanShift
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import random
random.seed( 30 )

In [2]:
plt.style.use('ggplot')

In [3]:
titanic_data_dir_path = 'D:/Py/My_DS/titanic/titanic'
titanic_file = 'gender_submission.csv'
titanic_train_f_name = 'train.csv'
titanic_test_f_name = 'test.csv'

df_test= pd.read_csv(os.path.join(titanic_data_dir_path,titanic_test_f_name))
df_train= pd.read_csv(os.path.join(titanic_data_dir_path,titanic_train_f_name))
df_y= pd.read_csv(os.path.join(titanic_data_dir_path,titanic_file))

#formatting matter, joining test and train
df = pd.concat([df_train,df_test])

print('-'*100)
df.info()
#preprocessing
original_df = pd.DataFrame.copy(df)
# filling missing data
df.drop(['Name'], 1, inplace=True)
df.fillna(9999, inplace=True)
#changeing values at float
df = df.apply(
    lambda col: pd.factorize(col)[0].astype(np.float64)
    if col.dtype not in ['int64', 'float64'] else col)


print('-'*100)
df_y.info()
print('-'*100)
df.info()

----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Ra

In [4]:
# fare and tickets are connected so this is not true fare
df.sort_values('Ticket').head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,0.0,22.0,1,0,0.0,7.25,0.0,0.0
1,2,1.0,1,1.0,38.0,1,0,1.0,71.2833,1.0,1.0
234,1126,9999.0,1,0.0,39.0,1,0,1.0,71.2833,1.0,1.0
2,3,1.0,3,1.0,26.0,0,0,2.0,7.925,0.0,0.0
3,4,1.0,1,1.0,35.0,1,0,3.0,53.1,2.0,0.0
137,138,0.0,1,0.0,37.0,1,0,3.0,53.1,2.0,0.0
4,5,0.0,3,0.0,35.0,0,0,4.0,8.05,0.0,0.0
5,6,0.0,3,0.0,9999.0,0,0,5.0,8.4583,0.0,2.0
146,1038,9999.0,1,0.0,9999.0,0,0,6.0,51.8625,3.0,0.0
6,7,0.0,1,0.0,54.0,0,0,6.0,51.8625,3.0,0.0


In [5]:
df_tmp = df.groupby('Ticket').agg(['count'])
df_tmp = df_tmp.iloc[:, 0:1]
df_tmp.columns =['count']
df = df.merge(df_tmp, on='Ticket',how = 'left')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,count
0,1,0.0,3,0.0,22.0,1,0,0.0,7.25,0.0,0.0,1
1,2,1.0,1,1.0,38.0,1,0,1.0,71.2833,1.0,1.0,2
2,3,1.0,3,1.0,26.0,0,0,2.0,7.925,0.0,0.0,1
3,4,1.0,1,1.0,35.0,1,0,3.0,53.1,2.0,0.0,2
4,5,0.0,3,0.0,35.0,0,0,4.0,8.05,0.0,0.0,1


In [6]:
#calculating true fare and droping columns
df['true_fare'] = round(df['Fare'] / df['count'],2)
df.drop(['count','Ticket','Fare','Cabin'], 1, inplace=True)

In [7]:
# making dummies
df = pd.get_dummies(df,columns=['Embarked','Sex','Pclass'], dtype=float)
df.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,true_fare,Embarked_0.0,Embarked_1.0,Embarked_2.0,Embarked_3.0,Sex_0.0,Sex_1.0,Pclass_1,Pclass_2,Pclass_3
0,1,0.0,22.0,1,0,7.25,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2,1.0,38.0,1,0,35.64,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,3,1.0,26.0,0,0,7.92,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,4,1.0,35.0,1,0,26.55,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,5,0.0,35.0,0,0,8.05,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [36]:
# last preprocessing
x_train = df[df['Survived'] < 2]
x_test = df[df['Survived'] ==9999.0]
y_train = x_train['Survived']
x_train.drop(['PassengerId','Survived'],1,inplace = True)
x_test.drop(['PassengerId','Survived'],1,inplace = True)

In [9]:
x_train.describe()

Unnamed: 0,Age,SibSp,Parch,true_fare,Embarked_0.0,Embarked_1.0,Embarked_2.0,Embarked_3.0,Sex_0.0,Sex_1.0,Pclass_1,Pclass_2,Pclass_3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2010.132626,0.523008,0.381594,14.551033,0.722783,0.188552,0.08642,0.002245,0.647587,0.352413,0.242424,0.20651,0.551066
std,3979.87017,1.102743,0.806057,13.574858,0.447876,0.391372,0.281141,0.047351,0.47799,0.47799,0.42879,0.405028,0.497665
min,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,0.0,7.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,32.0,0.0,0.0,8.05,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,54.0,1.0,0.0,13.825,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
max,9999.0,8.0,6.0,128.08,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# clustering

cls = KMeans(n_clusters=2)
cls.fit(x_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [30]:

correct = 0
df_y['survial_rate'] = np.nan
for i, x_data in enumerate(x_test.copy().to_numpy()):
    predict_me = np.array(x_data).astype(float)
    predict_me = predict_me.reshape(-1, len(predict_me))
    df_y['survial_rate'].iloc[i] = cls.predict(predict_me)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

In [31]:
correct = len(df_y[df_y['Survived']==df_y['survial_rate']])
acccuracy = correct/len(df_y) * 100

print(correct)
print(len(df_y))
print(acccuracy)

230
418
55.02392344497608


In [32]:
from sklearn import preprocessing, neighbors, svm

In [None]:
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)

In [None]:
accurycy = clf.score(x_test, df_y['Survived'].to_numpy())
print(accurycy)

In [None]:
from joblib import dump, load

In [None]:
dump(clf, 'titanic_svs_linear.joblib') 