# Libraries

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
print(pd.__version__)
print(np.version.version)
print(sys.version)

1.0.5
1.18.5
3.8.3 (default, Jul  2 2020, 11:26:31) 
[Clang 10.0.0 ]


# Reading Data

In [3]:
df_test = pd.read_csv("./Data/test.csv")
df_train = pd.read_csv("./Data/train.csv")

In [4]:
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### Comparing Columns

In [5]:
# find different column
train_columns = pd.DataFrame({"Diff_Column": df_train.columns,
                            "train_data" : range(len(df_train.columns.tolist()))})

test_columns = pd.DataFrame({'Diff_Column': df_test.columns, 
                             'test_data': range(len(df_test.columns.tolist()))})

col_merged = pd.merge(train_columns, test_columns, on = 'Diff_Column', how = 'left') # compare by column name

diff_columns = []
for i in col_merged[col_merged.isnull().any(axis = 1)]['Diff_Column']:
    diff_columns.append(i)

In [6]:
# Change Column

re_index = list(df_test.columns)

for col in df_train.columns:
    if col in diff_columns :
        re_index.append(col)
        
df_train = df_train.reindex(columns = re_index)


In [7]:
df_train.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,0
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,0
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C,1
890,891,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q,0


In [8]:
type(df_train["Parch"])

pandas.core.series.Series

In [9]:
train_info = pd.DataFrame(zip(df_train.columns, df_train.count(),df_train.nunique(), df_train.dtypes))
train_info.columns = ['Column', 'Count', 'Unique', 'Dtype']
test_info = pd.DataFrame(zip(df_test.columns, df_test.count(), df_test.nunique(), df_test.dtypes))
test_info.columns = ['Column', 'Count', 'Unique', 'Dtype']
pd.concat([train_info, test_info], axis = 1, join ='outer',
          keys = ['train info','test info'])#.reindex=[train_info.index]

Unnamed: 0_level_0,train info,train info,train info,train info,test info,test info,test info,test info
Unnamed: 0_level_1,Column,Count,Unique,Dtype,Column,Count,Unique,Dtype
0,PassengerId,891,891,int64,PassengerId,418.0,418.0,int64
1,Pclass,891,3,int64,Pclass,418.0,3.0,int64
2,Name,891,891,object,Name,418.0,418.0,object
3,Sex,891,2,object,Sex,418.0,2.0,object
4,Age,714,88,float64,Age,332.0,79.0,float64
5,SibSp,891,7,int64,SibSp,418.0,7.0,int64
6,Parch,891,7,int64,Parch,418.0,8.0,int64
7,Ticket,891,681,object,Ticket,418.0,363.0,object
8,Fare,891,248,float64,Fare,417.0,169.0,float64
9,Cabin,204,147,object,Cabin,91.0,76.0,object


# Data Processing

### Categorize column

In [10]:
category_list = []

In [11]:
uniques = {col: pd.DataFrame(df_test[col].value_counts()) 
              for col in list(df_test.columns)}

In [12]:
uniques

{'PassengerId':       PassengerId
 1023            1
 1128            1
 1156            1
 1157            1
 1158            1
 ...           ...
 1305            1
 1306            1
 1307            1
 1308            1
 1024            1
 
 [418 rows x 1 columns],
 'Pclass':    Pclass
 3     218
 1     107
 2      93,
 'Name':                                               Name
 Nourney, Mr. Alfred (Baron von Drachstedt")"     1
 Hirvonen, Mrs. Alexander (Helga E Lindqvist)     1
 Boulos, Master. Akar                             1
 Ilmakangas, Miss. Ida Livija                     1
 Hilliard, Mr. Herbert Henry                      1
 ...                                            ...
 Ryerson, Mr. Arthur Larned                       1
 Everett, Mr. Thomas James                        1
 Brandeis, Mr. Emil                               1
 Bentham, Miss. Lilian W                          1
 Brobeck, Mr. Karl Rudolf                         1
 
 [418 rows x 1 columns],
 'Sex':         

In [13]:
for i in uniques :
    if len(uniques[i]) <= 10: category_list.append(i)

In [14]:
category_list

['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

In [15]:
for col in category_list :
    df_train[col].astype('category')
    df_test[col].astype('category')

### Missing Value

In [16]:
round((df_train.isnull().sum()/len(df_test)*100).sort_values(ascending=False),1)

Cabin          164.4
Age             42.3
Embarked         0.5
Survived         0.0
Fare             0.0
Ticket           0.0
Parch            0.0
SibSp            0.0
Sex              0.0
Name             0.0
Pclass           0.0
PassengerId      0.0
dtype: float64

#### Cabin

In [17]:
df_train.drop("Cabin", axis=1, inplace=True)
df_test.drop("Cabin", axis=1, inplace=True)

#### Age

#### Embarked

### Converting to Numeric Value