In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
titanic = sns.load_dataset("titanic")
df = titanic.copy()
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Find the number of male and female passengers
df["sex"].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [4]:
# Find the number of unique values for each column
df.nunique()

survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64

In [5]:
# Find the number of unique values of the pclass variable
df["pclass"].nunique()

3

In [6]:
# Find the number of unique values of the pclass and parch variables
df[["pclass", "parch"]].nunique()

pclass    3
parch     7
dtype: int64

In [7]:
# Check the type of the embarked variable. Change its type to category and check again
df["embarked"].dtypes
df["embarked"] = df["embarked"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    category
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(3), float64(2), int64(4), object(4)
memory usage: 74.7+ KB


In [8]:
# Show all the wisdoms of those whose embarked value is C
df["embarked"].value_counts()  
df[df["embarked"] == "C"]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
30,0,1,male,40.0,0,0,27.7208,C,First,man,True,,Cherbourg,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,1,2,female,27.0,1,0,13.8583,C,Second,woman,False,,Cherbourg,yes,False
874,1,2,female,28.0,1,0,24.0000,C,Second,woman,False,,Cherbourg,yes,False
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False


In [9]:
# Show all the information of those whose embarked value is not S
df["embarked"].value_counts() 
df[df["embarked"] != "S"]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
16,0,3,male,2.0,4,1,29.1250,Q,Third,child,False,,Queenstown,no,False
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [10]:
# Show all information for passengers under 30 years of age and female
df[(df["sex"] == "female") & (df["age"] < 30)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
14,0,3,female,14.0,0,0,7.8542,S,Third,child,False,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,1,2,female,28.0,1,0,24.0000,C,Second,woman,False,,Cherbourg,yes,False
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True


In [11]:
# Mouse Show information of passengers older than 500 or older than 70
df[(df["fare"] > 500) | (df["age"] > 70)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True


In [12]:
# Find the sum of the null values in each variable
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [13]:
# Enter the who variable from the dataframe
df.drop("who", axis=1, inplace=True)

In [14]:
# Fill in the empty values of the deck variable with the most recurring value (mode) of the deck variable
df["deck"].fillna(df["deck"].mode()[0], inplace=True)
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
adult_male       0
deck             0
embark_town      2
alive            0
alone            0
dtype: int64

In [15]:
# Write a function that will return 1 to those under 30 and 0 to those above or equal to 30. Titanic data using the function you wrote
# Create a variable named age_flag in the set (Use apply and lambda structures)

def age_flag(x):
    if x < 30:
        return 1
    else:
        return 0


df["age_flag"] = df["age"].apply(lambda x: age_flag(x))
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone,age_flag
0,0,3,male,22.0,1,0,7.2500,S,Third,True,C,Southampton,no,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.9250,S,Third,False,C,Southampton,yes,True,1
3,1,1,female,35.0,1,0,53.1000,S,First,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.0500,S,Third,True,C,Southampton,no,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,True,C,Southampton,no,True,1
887,1,1,female,19.0,0,0,30.0000,S,First,False,B,Southampton,yes,True,1
888,0,3,female,,1,2,23.4500,S,Third,False,C,Southampton,no,False,0
889,1,1,male,26.0,0,0,30.0000,C,First,True,C,Cherbourg,yes,True,1


In [16]:
# Define the tips data set
tips = sns.load_dataset("tips")
df = tips.copy()
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [17]:
# Find the sum, min, max and average of total_bill values according to the categories (Dinner, Lunch) of the Time variable
df.groupby("time")["total_bill"].agg(["sum", "min", "max", "mean"])

Unnamed: 0_level_0,sum,min,max,mean
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lunch,1167.47,7.51,43.11,17.168676
Dinner,3660.3,3.07,50.81,20.797159


In [18]:
# Find the sum, min, max and average of total_bill values according to days and time
df.groupby(["day", "time"])["total_bill"].agg(["sum", "min", "max", "mean"])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,min,max,mean
day,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Thur,Lunch,1077.55,7.51,43.11,17.664754
Thur,Dinner,18.78,18.78,18.78,18.78
Fri,Lunch,89.92,8.58,16.27,12.845714
Fri,Dinner,235.96,5.75,40.17,19.663333
Sat,Lunch,0.0,,,
Sat,Dinner,1778.4,3.07,50.81,20.441379
Sun,Lunch,0.0,,,
Sun,Dinner,1627.16,7.25,48.17,21.41


In [19]:
# Find the sum, min, max and average of the total_bill and tip values of the lunch time and female customers according to day
# Filter for women and lunch
df.loc[(df["sex"] == "Female") & (df["time"] == "Lunch")].groupby("day")[
    "total_bill", "tip"].agg(["sum", "min", "max", "mean"])

  df.loc[(df["sex"] == "Female") & (df["time"] == "Lunch")].groupby("day")[


Unnamed: 0_level_0,total_bill,total_bill,total_bill,total_bill,tip,tip,tip,tip
Unnamed: 0_level_1,sum,min,max,mean,sum,min,max,mean
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Thur,516.11,8.35,43.11,16.64871,79.42,1.25,5.17,2.561935
Fri,55.76,10.09,16.27,13.94,10.98,2.0,3.48,2.745
Sat,0.0,,,,0.0,,,
Sun,0.0,,,,0.0,,,


In [20]:
# What is the average of orders with size i less than 3 and total_bill greater than 10? (use loc)
df.loc[(df["size"] < 3) & (df["total_bill"] > 10)].mean()

  df.loc[(df["size"] < 3) & (df["total_bill"] > 10)].mean()


total_bill    17.184965
tip            2.638811
size           1.993007
dtype: float64

In [21]:
# total_bill_tip_sum adinda yeni bir degisken olusturunuz. Her bir müsterinin ödedigi totalbill ve tip in toplamini versin
df["total_bill_tip_sum"] = df["total_bill"] + df["tip"]
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_tip_sum
0,16.99,1.01,Female,No,Sun,Dinner,2,18.00
1,10.34,1.66,Male,No,Sun,Dinner,3,12.00
2,21.01,3.50,Male,No,Sun,Dinner,3,24.51
3,23.68,3.31,Male,No,Sun,Dinner,2,26.99
4,24.59,3.61,Female,No,Sun,Dinner,4,28.20
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,34.95
240,27.18,2.00,Female,Yes,Sat,Dinner,2,29.18
241,22.67,2.00,Male,Yes,Sat,Dinner,2,24.67
242,17.82,1.75,Male,No,Sat,Dinner,2,19.57


In [22]:
# Sort from largest to smallest according to the total_bill tip_sum variable and assign the first 30 people to a new dataframe
df_new = df.sort_values("total_bill_tip_sum", ascending=False).head(30)
df_new

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_tip_sum
170,50.81,10.0,Male,Yes,Sat,Dinner,3,60.81
212,48.33,9.0,Male,No,Sat,Dinner,4,57.33
59,48.27,6.73,Male,No,Sat,Dinner,4,55.0
156,48.17,5.0,Male,No,Sun,Dinner,6,53.17
182,45.35,3.5,Male,Yes,Sun,Dinner,3,48.85
197,43.11,5.0,Female,Yes,Thur,Lunch,4,48.11
23,39.42,7.58,Male,No,Sat,Dinner,4,47.0
102,44.3,2.5,Female,Yes,Sat,Dinner,3,46.8
142,41.19,5.0,Male,No,Thur,Lunch,5,46.19
95,40.17,4.73,Male,Yes,Fri,Dinner,4,44.9
