In [106]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [107]:
# 1. Loading training and testing datasets and merging them:
train_df = pd.read_csv('train.csv')[['Name', 'Age']]
test_df = pd.read_csv('test.csv')[['Name', 'Age']]
data = train_df.append(test_df)

# 2. Loading real data:
real_data = pd.read_excel('real_data.xls')[['age']]
print("AGE NaN count:", real_data['age'].isnull().sum())

AGE NaN count: 263


In [48]:
title_feature = pd.DataFrame()
title_feature['Title'] = data['Name']
for name_string in data['Name']:
    title_feature['Title']=data.Name.str.extract('([A-Za-z]+)\.', expand=True)

mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
title_feature.replace({'Title': mapping}, inplace=True)

In [49]:
title_feature['Age'] = data['Age']
title_feature.groupby('Title')['Age'].mean()

Title
Dr        43.571429
Master     5.482642
Miss      21.834533
Mr        32.569374
Mrs       37.046243
Rev       41.250000
Name: Age, dtype: float64

In [97]:
temp = pd.DataFrame()
temp['Title'] = title_feature['Title']
temp['Age'] = data['Age'] 

# Forming titles DFs
titles = list(set(temp['Title'].tolist())) # List of all our chosen titles
l = []
for t in titles:
    l.append(temp.loc[(temp.Age.notnull()) & (temp['Title']==t)])

# Note that we perform imputation on TRAIN_DF now!
data['Title'] = title_feature['Title']
data.loc[(data['Age'].isnull()) & (data['Title']=='Dr'),'Age']     = 46.5
data.loc[(data['Age'].isnull()) & (data['Title']=='Master'),'Age'] = 3.5
data.loc[(data['Age'].isnull()) & (data['Title']=='Miss'),'Age']   = 21.5
data.loc[(data['Age'].isnull()) & (data['Title']=='Mr'),'Age']     = 30.0
data.loc[(data['Age'].isnull()) & (data['Title']=='Mrs'),'Age']    = 35.9
data.loc[(data['Age'].isnull()) & (data['Title']=='Rev'),'Age']    = 46.5

# Dropping Title feature for now - because maybe it will not be needed at all, who knows.
data.drop('Title', axis=1, inplace=True)

In [98]:
print(data.tail(10), '\n', real_data.tail(10))
print(real_data.shape)

                                                Name   Age
408                  Riordan, Miss. Johanna Hannah""  21.5
409                        Peacock, Miss. Treasteall   3.0
410                           Naughton, Miss. Hannah  21.5
411  Minahan, Mrs. William Edward (Lillian E Thorpe)  37.0
412                   Henriksson, Miss. Jenny Lovisa  28.0
413                               Spector, Mr. Woolf  30.0
414                     Oliva y Ocana, Dona. Fermina  39.0
415                     Saether, Mr. Simon Sivertsen  38.5
416                              Ware, Mr. Frederick  30.0
417                         Peter, Master. Michael J   3.5 
        age
1299  27.0
1300  15.0
1301  45.5
1302   NaN
1303   NaN
1304  14.5
1305   NaN
1306  26.5
1307  27.0
1308  29.0
(1309, 1)


In [93]:
z = real_data['age'] - data['Age']

z

0        7.0000
0       -5.5000
1      -37.0833
1      -46.0833
2      -24.0000
2      -60.0000
3       -5.0000
3        3.0000
4      -10.0000
4        3.0000
5       18.0000
5       34.0000
6        9.0000
6       33.0000
7       37.0000
7       13.0000
8       26.0000
8       35.0000
9       57.0000
9       50.0000
10      43.0000
10      17.0000
11     -40.0000
11     -28.0000
12       4.0000
12       1.0000
13     -13.0000
13     -37.0000
14      66.0000
14      33.0000
         ...   
1279        NaN
1280        NaN
1281        NaN
1282        NaN
1283        NaN
1284        NaN
1285        NaN
1286        NaN
1287        NaN
1288        NaN
1289        NaN
1290        NaN
1291        NaN
1292        NaN
1293        NaN
1294        NaN
1295        NaN
1296        NaN
1297        NaN
1298        NaN
1299        NaN
1300        NaN
1301        NaN
1302        NaN
1303        NaN
1304        NaN
1305        NaN
1306        NaN
1307        NaN
1308        NaN
Length: 1727, dtype: flo