In [9]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
%matplotlib inline

In [10]:
header_names = ['PId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Tic', 'Fare', 'Cabin', 'Embarked']

In [11]:
my_data = read_csv ('C:/dataset/titanic.csv', names=header_names, header=None, skiprows=1)

In [12]:
print(my_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PId       891 non-null    int64  
 1   Survived  891 non-null    int64  
 2   Pclass    891 non-null    int64  
 3   Name      891 non-null    object 
 4   Sex       891 non-null    object 
 5   Age       714 non-null    float64
 6   SibSp     891 non-null    int64  
 7   Parch     891 non-null    int64  
 8   Tic       891 non-null    object 
 9   Fare      891 non-null    float64
 10  Cabin     204 non-null    object 
 11  Embarked  889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [13]:
print(my_data.shape)

(891, 12)


In [14]:
print(my_data.head(5))

   PId  Survived  Pclass                                               Name  \
0    1         0       3                            Braund, Mr. Owen Harris   
1    2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2    3         1       3                             Heikkinen, Miss. Laina   
3    4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4    5         0       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch               Tic     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  


In [15]:
print('Number of empty values per variable\n', my_data.isna().sum())

Number of empty values per variable
 PId           0
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Tic           0
Fare          0
Cabin       687
Embarked      2
dtype: int64


In [16]:
empty_value_data = my_data[my_data.isna().any(axis=1)]
print(empty_value_data.shape)
print(len(empty_value_data))

(708, 12)
708


In [17]:
my_data.fillna(my_data.mean(), inplace=True)

In [18]:
print('Total number of empty values per variable\n', my_data.isna().sum())

Total number of empty values per variable
 PId           0
Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Tic           0
Fare          0
Cabin       687
Embarked      2
dtype: int64


In [19]:
labelEncoder = LabelEncoder()
labelEncoder.fit(my_data['Sex'])
my_data['Sex'] = labelEncoder.transform(my_data['Sex'])

In [20]:
print(my_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   PId       891 non-null    int64  
 1   Survived  891 non-null    int64  
 2   Pclass    891 non-null    int64  
 3   Name      891 non-null    object 
 4   Sex       891 non-null    int32  
 5   Age       891 non-null    float64
 6   SibSp     891 non-null    int64  
 7   Parch     891 non-null    int64  
 8   Tic       891 non-null    object 
 9   Fare      891 non-null    float64
 10  Cabin     204 non-null    object 
 11  Embarked  889 non-null    object 
dtypes: float64(2), int32(1), int64(5), object(4)
memory usage: 80.2+ KB
None


In [21]:
my_data.describe()

Unnamed: 0,PId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,1.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [30]:
my_data = my_data.drop(['Name', 'Tic', 'Cabin', 'Embarked'], axis=1)
X_data = np.array(my_data.drop(['Sex'], 1).astype(float))

KeyError: "['Name' 'Tic' 'Cabin' 'Embarked'] not found in axis"

In [23]:
y_data = np.array(my_data['Survived'])

In [25]:
kmeans_model = KMeans (n_clusters=2)

In [26]:
kmeans_model.fit(X_data)

KMeans(n_clusters=2)

In [27]:
correct = 0
for i in range(len(X_data)):
    predict_data = np.array(X_data[i].astype(float))
    predict_data = predict_data.reshape(-1, len(predict_data))
    prediction = kmeans_model.predict(predict_data)
    if prediction[0] == y_data[i]:
        correct+= 1

In [28]:
print(correct/len(X_data))

0.468013468013468
