# Titanic Dataset

In [1]:
import pandas as pd

# Download the Titanic passenger table from the datasciencedojo/datasets
# GitHub repository and preview its first two rows.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(TITANIC_CSV_URL)
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


##### How many male and female passengers were onboard

In [2]:
# Count passengers per sex by summing boolean masks built from the "Sex" column.
is_female = df["Sex"] == "female"
is_male = df["Sex"] == "male"
female = int(is_female.sum())
male = int(is_male.sum())

print("Number of females: ", female)
print("Number of males: ", male)


Number of females:  314
Number of males:  577


##### How many survivors do we have

In [3]:
# Rows whose "Survived" flag equals 1 are the survivors.
survived_mask = df["Survived"].eq(1)
_surv = int(survived_mask.sum())
print("Number of survivors: ", _surv)

Number of survivors:  342


##### How many casualties

In [4]:
# Rows whose "Survived" flag equals 0 are counted as casualties.
died_mask = df["Survived"].eq(0)
_nonsurv = int(died_mask.sum())
print("Number of Casualities: ", _nonsurv)

Number of Casualities:  549


##### Name of eldest person/s

In [5]:
# Series.max() skips missing ages. The builtin max() compares element by
# element, and every comparison against NaN is False, so its result depends on
# where the NaN values happen to sit in the column (a NaN in the first row
# would be returned as the "maximum").
eldest = df["Age"].max()

# Keep the name of every passenger tied for the maximum age.
_eldest_name = df.loc[df["Age"] == eldest, "Name"]

print("Name of eldest person: ",_eldest_name)

Name of eldest person:  630    Barkworth, Mr. Algernon Henry Wilson
Name: Name, dtype: object


##### Number of passengers in each class

In [7]:
# Tally passengers per ticket class with a single pass over "Pclass".
# The previous bare `df["Pclass"].unique()` statement was dropped: its result
# was never displayed or stored, so it had no effect on the cell.
class_counts = df["Pclass"].value_counts()

# .get(label, 0) reports zero for a class with no rows instead of raising;
# int() keeps the counts as plain Python integers.
_1st = int(class_counts.get(1, 0))
_2nd = int(class_counts.get(2, 0))
_3rd = int(class_counts.get(3, 0))

print("Number of passengers in 1st class: ",_1st)
print("Number of passengers in 2nd class: ",_2nd)
print("Number of passengers in 3rd class: ",_3rd)

Number of passengers in 1st class:  216
Number of passengers in 2nd class:  184
Number of passengers in 3rd class:  491


##### Number of persons whose name starts with "S"

In [8]:
# Flag every "Name" value that begins with a capital "S", then count the flags.
starts_with_s = df["Name"].str.startswith("S")
count = starts_with_s.sum()

print("Number of passengers whose name starts with S is ",count)

Number of passengers whose name starts with S is  86


##### Create a new column that is the sum of "SibSp" and "Parch"

In [9]:
# Add "Total" = SibSp + Parch and place it immediately after "Parch".
#
# The position is derived from the "Parch" label rather than hard-coded slice
# indices, so the cell no longer depends on the exact column layout. Dropping
# any existing "Total" first makes the cell safe to re-run: the old slicing
# version reshuffled the columns (moving "Embarked") on a second execution.
df = df.drop(columns="Total", errors="ignore")
df.insert(df.columns.get_loc("Parch") + 1, "Total", df["SibSp"] + df["Parch"])

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Total,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,1,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,1,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,1,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,3,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,0,111369,30.0000,C148,C


##### How many persons do we have below age of 25

In [10]:
# Passengers with a recorded age strictly below 25; rows with a missing age
# fail the comparison and are not counted.
under_25 = df["Age"].lt(25.0)
_age = int(under_25.sum())

print("Number of people below 25 yrs age: ",_age)

Number of people below 25 yrs age:  278


##### How many persons died whose age was less than 40

In [11]:
# Select non-survivors (Survived == 0) whose recorded age is under 40.
# Both conditions are combined into one mask; rows with a missing age fail the
# age comparison and are therefore excluded.
did_not_survive = df["Survived"] == 0
younger_than_40 = df["Age"] < 40.0
df1 = df[did_not_survive & younger_than_40]

print("Number of persons who died below age of 40 are: ", len(df1))

Number of persons who died below age of 40 are:  322


##### From Cabin column separate numerical values

In [13]:
# New column name is 'Cabin_split'.
# Remove every run of capital letters from the cabin code; all other
# characters are kept, and missing cabins stay missing.
# regex=True must be passed explicitly: from pandas 2.0 onward str.replace
# treats the pattern as a literal string by default, which would leave the
# cabin codes unchanged.
df["Cabin_split"] = df["Cabin"].str.replace(r"[A-Z]+", "", regex=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Total,Ticket,Fare,Cabin,Embarked,Cabin_split
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,1,A/5 21171,7.2500,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,1,PC 17599,71.2833,C85,C,85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,0,STON/O2. 3101282,7.9250,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,1,113803,53.1000,C123,S,123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,0,373450,8.0500,,S,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,0,211536,13.0000,,S,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,0,112053,30.0000,B42,S,42
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,3,W./C. 6607,23.4500,,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,0,111369,30.0000,C148,C,148


In [2]:
import pandas as pd

# Bank Data Dataset

In [3]:
# Read the semicolon-separated bank data file from the working directory and
# preview its first three rows.
df = pd.read_csv("bank-full.csv", sep=";")
df.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


##### How many campaigns are available in the dataset

In [29]:
# Add up the "campaign" column over all rows.
# NOTE(review): this sums per-row values; if "campaign" holds a per-customer
# contact count rather than a campaign identifier, the total is the number of
# contacts, not the number of campaigns. Confirm against the data dictionary.
campaign_values = df["campaign"]
_sum = campaign_values.sum()

print("Total number of campaigns conducted so far are ",_sum)

Total number of campaigns conducted so far are  124956


##### How many users with housing and personal loan

In [15]:
# Build one subset per loan type; each is counted independently, so a customer
# holding both loans appears in both subsets.
has_housing_loan = df["housing"].eq("yes")
has_personal_loan = df["loan"].eq("yes")

_housing_loan = df.loc[has_housing_loan]
print("Number of housing loans: ",len(_housing_loan))

_personal_loan = df.loc[has_personal_loan]
print("Number of personal loans: ",len(_personal_loan))

Number of housing loans:  25130
Number of personal loans:  7244


##### How many persons with age>60

In [16]:
# Customers strictly older than 60.
over_60 = df["age"].gt(60)
_senior = df.loc[over_60]

print("Number of senior citizes: ",len(_senior))

Number of senior citizes:  1188


##### In which month were most customers targeted

In [13]:
# Number of rows per month, keyed by month name.
month_counts = df["month"].value_counts()
d = dict(month_counts)
_max = max(d.values())

# Report every month tied for the highest row count.
busiest_months = [month for month, n_rows in d.items() if n_rows == _max]
for month in busiest_months:
    print("Most campaigns done in the month of", month)

Most campaigns done in the month of may


##### Which mode of call gives better results

In [5]:
# Group rows by contact channel and outcome ("y"), then count the non-null
# entries of every remaining column within each (contact, y) pair.
rows_by_contact_and_outcome = df.groupby(by=["contact", "y"])
df_groupby = rows_by_contact_and_outcome.count()
df_groupby

Unnamed: 0_level_0,Unnamed: 1_level_0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome
contact,y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
cellular,no,24916,24916,24916,24916,24916,24916,24916,24916,24916,24916,24916,24916,24916,24916,24916
cellular,yes,4369,4369,4369,4369,4369,4369,4369,4369,4369,4369,4369,4369,4369,4369,4369
telephone,no,2516,2516,2516,2516,2516,2516,2516,2516,2516,2516,2516,2516,2516,2516,2516
telephone,yes,390,390,390,390,390,390,390,390,390,390,390,390,390,390,390
unknown,no,12490,12490,12490,12490,12490,12490,12490,12490,12490,12490,12490,12490,12490,12490,12490
unknown,yes,530,530,530,530,530,530,530,530,530,530,530,530,530,530,530


In [10]:
# Number of "yes" outcomes per contact channel, read from the first column of
# the grouped counts.
# .iloc[0] makes the positional access explicit: the row returned by .loc is
# labelled by column name, and plain [0] on such a Series relies on a
# positional fallback that pandas has deprecated.
# NOTE(review): these are raw counts; comparing channels fairly would need the
# yes-rate (yes / (yes + no)) per channel, since the channels differ in size.
cell = df_groupby.loc["cellular", "yes"].iloc[0]
tele = df_groupby.loc["telephone", "yes"].iloc[0]
unkwn = df_groupby.loc["unknown", "yes"].iloc[0]

print(cell, tele, unkwn)

4369 390 530


##### How many entrepreneurs are there in the list

In [18]:
# Customers whose "job" value is exactly "entrepreneur".
is_entrepreneur = df["job"].eq("entrepreneur")
_en = df.loc[is_entrepreneur]

print("Total number of entrepreneurs: ",len(_en))


Total number of entrepreneurs:  1487


##### How many customers have negative balance

In [25]:
# Customers whose balance is strictly below zero.
is_overdrawn = df["balance"].lt(0)
_neg = df.loc[is_overdrawn]
print("Total number of customers haveing negative balance: ",len(_neg))

Total number of customers haveing negative balance:  3766


##### Prepare group of data based on education level

In [26]:
# Distinct education levels, in order of first appearance.
pd.unique(df["education"])

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [28]:
# Subset of customers whose education level is "primary".
is_primary = df["education"].eq("primary")
_primary = df.loc[is_primary]
_primary

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
8,58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
15,51,retired,married,primary,no,229,yes,no,unknown,5,may,353,1,-1,0,unknown,no
17,57,blue-collar,married,primary,no,52,yes,no,unknown,5,may,38,1,-1,0,unknown,no
18,60,retired,married,primary,no,60,yes,no,unknown,5,may,219,1,-1,0,unknown,no
22,32,blue-collar,single,primary,no,23,yes,yes,unknown,5,may,160,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45152,61,housemaid,married,primary,no,182,no,no,cellular,9,nov,335,1,92,13,failure,no
45154,63,retired,married,primary,no,3738,no,no,telephone,9,nov,301,1,456,4,failure,no
45170,19,student,single,primary,no,245,no,no,telephone,10,nov,98,2,110,2,other,no
45183,70,retired,married,primary,no,324,no,no,cellular,15,nov,78,1,96,7,success,no


In [29]:
# Subset of customers whose education level is "secondary".
is_secondary = df["education"].eq("secondary")
_secondary = df.loc[is_secondary]
_secondary

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
9,43,technician,single,secondary,no,593,yes,no,unknown,5,may,55,1,-1,0,unknown,no
10,41,admin.,divorced,secondary,no,270,yes,no,unknown,5,may,222,1,-1,0,unknown,no
11,29,admin.,single,secondary,no,390,yes,no,unknown,5,may,137,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45204,73,retired,married,secondary,no,2850,no,no,cellular,17,nov,300,1,40,8,failure,yes
45205,25,technician,single,secondary,no,505,no,yes,cellular,17,nov,386,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [30]:
# Subset of customers whose education level is "tertiary".
is_tertiary = df["education"].eq("tertiary")
_tertiary = df.loc[is_tertiary]
_tertiary

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
21,56,management,married,tertiary,no,779,yes,no,unknown,5,may,164,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45194,59,management,married,tertiary,no,138,yes,yes,cellular,16,nov,162,2,187,5,failure,no
45198,37,management,married,tertiary,no,1428,no,no,cellular,16,nov,333,2,-1,0,unknown,no
45201,53,management,married,tertiary,no,583,no,no,cellular,17,nov,226,1,184,4,success,yes
45203,23,student,single,tertiary,no,113,no,no,cellular,17,nov,266,1,-1,0,unknown,yes
