In [1]:
import pandas as pd
import numpy as np

# Load the Titanic training data into a DataFrame
titanic_survival = pd.read_csv("titanic_train.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the summary.
titanic_survival.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [2]:
# Inspect the missing values in the "Age" column
age = titanic_survival.Age
print(age.loc[0:10])  # label-based slice: rows labelled 0 through 10, inclusive

print('----------------------')
age_is_null = age.isnull()  # boolean mask: True where the age is missing
print(age_is_null.loc[0:10])

print('----------------------')
age_null_true = age.loc[age_is_null]  # keep only the missing (NaN) entries
print(age_null_true.loc[0:10])

print('----------------------')
age_null_count = age_null_true.shape[0]  # number of missing ages
print(age_null_count)

0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64
----------------------
0     False
1     False
2     False
3     False
4     False
5      True
6     False
7     False
8     False
9     False
10    False
Name: Age, dtype: bool
----------------------
5   NaN
Name: Age, dtype: float64
----------------------
177


In [3]:
# Naive mean: the column still contains NaN, so the built-in sum() and
# therefore the computed mean come out as nan.
mean_age = sum(titanic_survival.Age) / len(titanic_survival.Age)
print(mean_age)  # nan

# Filter out the missing values first, then average what is left
age_is_null = pd.isnull(titanic_survival.Age)
filted = titanic_survival.Age[~age_is_null]
mean = sum(filted) / len(filted)
print(mean)

nan
29.69911764705882


In [4]:
# Easier way to get the mean: Series.mean() skips NaN values by default,
# so no manual filtering is needed.
easy_mean_age = titanic_survival.loc[:, 'Age'].mean()
easy_mean_age

29.69911764705882

In [5]:
# Mean fare for each passenger class ('Pclass'):
# select the 'Fare' values of the rows in a class and average them.
passenger_classes = [1, 2, 3]
classes_fare = {
    pc: titanic_survival.loc[titanic_survival['Pclass'] == pc, 'Fare'].mean()
    for pc in passenger_classes
}

print(classes_fare)

{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}


In [6]:
# Easier way to calculate the mean fare for each Pclass: pivot_table.
# The aggregation is named by string ("mean") instead of passing np.mean:
# recent pandas versions emit a FutureWarning when a NumPy callable is given,
# and the string form produces the same result.
passenger_Fares = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(passenger_Fares)

print('-------------------')
# Average survival rate and average age in each Pclass
# (aggfunc is omitted here, so pivot_table uses its default, the mean)
passenger_survival = titanic_survival.pivot_table(index="Pclass", values=["Survived", 'Age'])
print(passenger_survival)

             Fare
Pclass           
1       84.154687
2       20.662183
3       13.675550
-------------------
              Age  Survived
Pclass                     
1       38.233441  0.629630
2       29.877630  0.472826
3       25.140620  0.242363


In [7]:
# Drop missing values
# axis='columns' (same as axis=1) drops every column that contains a null value
drop_na_columns = titanic_survival.dropna(axis='columns')
# axis='index' (same as axis=0) with subset drops the rows whose "Age" or "Sex" is null
new_titanic_survival = titanic_survival.dropna(axis='index', subset=["Age", "Sex"])

print(new_titanic_survival.iloc[0:10])

    PassengerId  Survived  Pclass  \
0             1         0       3   
1             2         1       1   
2             3         1       3   
3             4         1       1   
4             5         0       3   
6             7         0       1   
7             8         0       3   
8             9         1       3   
9            10         1       2   
10           11         1       3   

                                                 Name     Sex   Age  SibSp  \
0                             Braund, Mr. Owen Harris    male  22.0      1   
1   Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                              Heikkinen, Miss. Laina  female  26.0      0   
3        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                            Allen, Mr. William Henry    male  35.0      0   
6                             McCarthy, Mr. Timothy J    male  54.0      0   
7                      Palsson, Master. Gosta 

In [8]:
# Locate a specific value by row label and column name
row_index_83_age = titanic_survival["Age"].loc[83]
# NOTE: despite the "1000" in its name, this reads the row labelled 766
row_index_1000_pclass = titanic_survival["Pclass"].loc[766]
print(row_index_83_age)
print(row_index_1000_pclass)

28.0
1


In [9]:
# Sort the rows by age, then rebuild the row index
# Rows ordered by "Age", highest first; each row keeps its original label
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival.iloc[0:10])

print('-------------------------------------------')
# reset_index(drop=True) renumbers the rows from 0 and discards the old labels
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed[0:10])


     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0     

In [10]:
# Apply a function to every column of the DataFrame
def get_hundredth_item(column):
    """Return the hundredth item (position 99) of a Series.

    Raises IndexError if the Series has fewer than 100 items.
    """
    return column.iloc[99]

# The helper has its own name so that storing the result in `hundredth_row`
# no longer overwrites the function with the Series it produced.
# DataFrame.apply calls the helper once per column (default axis=0).
hundredth_row = titanic_survival.apply(get_hundredth_item)
print(hundredth_row)

PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object


In [11]:
# Count the missing (null) values in one column
def null_count(column):
    """Return how many entries of the Series `column` are null, as an int."""
    missing_mask = pd.isnull(column)  # True wherever the value is null
    # Each True counts as 1, so the sum is the number of null entries
    return int(missing_mask.sum())

# Run null_count on every column (axis=0 is the default: one call per column)
column_null_count = titanic_survival.apply(null_count, axis=0)
print(column_null_count)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [12]:
# Function

def which_class(row):
    """Map a row's 'Pclass' value to a readable class label.

    Returns "Unknown" when 'Pclass' is null, the matching label for 1, 2 or 3,
    and None for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    class_labels = {
        1: "First Class",
        2: "Second Class",
        3: "Third Class",
    }
    # .get() yields None for values outside 1-3
    return class_labels.get(pclass)


# axis='columns' (same as axis=1) passes each row to which_class
classes = titanic_survival.apply(which_class, axis='columns')
print(classes.iloc[0:10])


0     Third Class
1     First Class
2     Third Class
3     First Class
4     Third Class
5     Third Class
6     First Class
7     Third Class
8     Third Class
9    Second Class
dtype: object
