# Data Visualization

In [1]:
#import required libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
!pip install openpyxl



In [3]:
#LOAD DATA
df=pd.read_csv("../data/gapminder.csv")
titanic=pd.read_excel("../data/titanic.xlsx")
join_data1=pd.read_csv("../data/joining_data1.csv")
join_data2=pd.read_csv("../data/joining_data2.csv")
union_data=pd.read_csv("../data/union_data.csv")

In [4]:
join_left=pd.merge(left=join_data1,
                  right=join_data2,
                   how="left",
                   on="name"
                  )

In [5]:
join_right=pd.merge(left=join_data1,
                  right=join_data2,
                   how="right",
                   on="name"
                  )

In [6]:
join_left

Unnamed: 0,name,animal,age,vaccinated
0,Catalie Portman,cat,3,True
1,Pico de Gato,cat,5,True
2,Chewbarka,dog,1,False
3,Sir Isaac Mewton,cat,7,True
4,K9,cat,11,False
5,Arf Vader,dog,6,


In [7]:
join_right

Unnamed: 0,name,animal,age,vaccinated
0,Catalie Portman,cat,3.0,True
1,Pico de Gato,cat,5.0,True
2,Chewbarka,dog,1.0,False
3,Sir Isaac Mewton,cat,7.0,True
4,K9,cat,11.0,False
5,Spiderpig,,,True


# Aggregation

In [8]:
titanic.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'home.dest', 'body', 'boat'],
      dtype='object')

In [9]:
titanic["sex"].unique()

array(['female', 'male'], dtype=object)

In [10]:
#count passengers by sex
titanic["sex"].value_counts()

sex
male      843
female    466
Name: count, dtype: int64

In [11]:
#check survival by sex
titanic.groupby(["sex"])["survived"].agg(["sum","mean"])

Unnamed: 0_level_0,sum,mean
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,339,0.727468
male,161,0.190985


## Pivot Table

In [12]:
titanic_pivot=titanic.pivot_table(values="fare",index="sex",aggfunc='mean')
titanic_pivot

Unnamed: 0_level_0,fare
sex,Unnamed: 1_level_1
female,46.198097
male,26.154601


In [13]:
titanic_copy=titanic.copy()

In [14]:
titanic_copy.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'home.dest', 'body', 'boat'],
      dtype='object')

In [15]:
titanic_copy.rename(columns={'sex':'gender'},inplace=True)

In [16]:
titanic_copy.columns

Index(['pclass', 'survived', 'name', 'gender', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked', 'home.dest', 'body', 'boat'],
      dtype='object')

In [17]:
titanic_copy=titanic_copy.columns.str.capitalize()
titanic_copy

Index(['Pclass', 'Survived', 'Name', 'Gender', 'Age', 'Sibsp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Home.dest', 'Body', 'Boat'],
      dtype='object')

In [18]:
titanic_copy.groupby("gender")["fare"].agg(['mean','max','min'])

TypeError: Categorical input must be list-like

In [None]:
titanic_copy.iloc[[0]]

In [None]:
#rows,columns
titanic_copy.iloc[[1,4],[1,3]]

In [None]:
titanic_copy.loc[0:5]

In [None]:
titanic_copy.tail()

## iloc

In [None]:
titanic_copy.iloc[0]

In [None]:
titanic_copy.iloc[[0]]

In [None]:
titanic_copy.iloc[[0,2]]

In [None]:
titanic_copy.iloc[0,5]

In [None]:
titanic_copy.loc[0]

In [None]:
titanic_copy.loc[[0]]

In [None]:
titanic_copy.loc[0:2]

In [None]:
titanic_copy.loc[0:5,["fare"]]

In [None]:
titanic_copy.iloc[10:15]

# sort_values

In [None]:
join_data1.sort_values(by="age",ascending=True)

In [None]:
join_data2.sort_values(by="vaccinated",ascending=True)

In [None]:
join_data1["name"].unique()

In [None]:
join_data2["name"].unique()

In [None]:
titanic_copy.columns

In [None]:
sns.countplot(x="survived",data=titanic_copy,color="pink",label="survived")
plt.title("Survival Count",color="skyblue")
plt.legend();

In [None]:
titanic_copy["age"].hist(bins=10)
plt.title("Age Distribution");

In [None]:
titanic_copy["fare"].hist(bins=30)
plt.title("Fare Distribution");

In [None]:
titanic_copy["pclass"].hist(bins=30)
plt.title("passenger class distribution");

In [None]:
sns.boxplot(x="pclass",y="age",data=titanic_copy)
plt.title("Age Distribution by class");

In [None]:
sns.heatmap(titanic.isnull(),cbar=True,cmap="viridis")
plt.title("Missing Values Heatmap");

In [None]:
#titanic_copy=titanic_copy.columns.str.upper()
#titanic_copy.columns

In [None]:
#titanic_copy.groupby("gender")["fare"].agg(['mean','max','min'])

# Assignment
1. Does the ticket fare, passenger, family onboard(parch) class afffect chances survival?
2. 

In [None]:
corr=titanic[["fare","parch","pclass","survived"]].corr()
#titanic["ticket"].unique
corr

In [None]:
sns.heatmap(data=corr,annot=True);

In [None]:
sns.relplot(
    data=corr,
    x="fare",
    y='survived',
    #hue="gdp_per_cap",
    #size="pop",
    #sizes=(100,1000)
);


In [None]:
sns.relplot(
    data=corr,
    x="parch",
    y='survived',
    #hue="gdp_per_cap",
    #size="pop",
    #sizes=(100,1000)
);

In [None]:
sns.relplot(
    data=corr,
    x="fare",
    y='pclass',
    #hue="gdp_per_cap",
    #size="pop",
    #sizes=(100,1000)
);

In [None]:
df.head()


In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df["year"].max()

## Introducing data vis

The following cells introduce diffferent plot types

In [None]:
x=np.linspace(0,0.3,12)
y=x**2+2
plt.plot(x,y)
plt.xlabel("random numbers")
plt.ylabel("function of random numbers")
plt.title("function vs random nums",color="blue")
plt.show()

In [None]:
#scatter plot
x=[5,7,8,9,7,9,8,8,9]
y=[71,79,98,89,90,78,80,70,87]
plt.scatter(x,y)
plt.show()

In [None]:
#bar graph
categories=['A','B','C','D']
values=3,6,4,3
plt.bar(categories,values)
plt.show()

In [None]:
data=np.random.normal(0,1,100)
plt.hist(data,bins=10,color="green")
plt.show()

In [None]:

# Create subplots (2 rows, 2 columns)
fig, axs = plt.subplots(2, 2, figsize=(8, 6))

#fig 1
x=np.linspace(0,0.3,12)
y=x**2+2
#plt.plot(x,y)
axs[0, 0].plot(x, y)
axs[0, 0].set_title("line graph ",color="blue")
axs[0, 0].set_ylabel("y",color="blue")
axs[0, 0].set_xlabel("x")

#fig 2
x1=[1,3,6,8,4,7,9,4,6,7]
y1=[10,33,63,89,45,78,90,45,65,71]
axs[0, 1].scatter(x1, y1, color='blue')
axs[0, 1].set_title("Scatter_practice",color="blue")
axs[0, 1].set_xlabel("x1",color="blue")
axs[0, 1].set_ylabel("y1",color="blue")


#fig 3
categories=['A','B','C','D']
values=3,6,4,3
axs[1, 0].bar(categories,values, color='skyblue')
axs[1, 0].set_title("Bar graph",color="blue")
axs[1, 0].set_xlabel("categories",color="blue")
axs[1, 0].set_ylabel("values",color="blue")

#fig 4
data=np.random.normal(0,1,100)
axs[1, 1].hist(data,bins=10, color='skyblue')
axs[1, 1].set_title("histogram ",color="blue")
axs[1, 1].set_xlabel("x ",color="blue")
axs[1, 1].set_ylabel("y ",color="blue")

# layout
plt.tight_layout()
plt.show()

## Scatter plots

In [None]:
#gapminder data
df.info()

In [None]:
df["year"].unique()

In [None]:
#extract data of 2002
df_2002=df[df["year"]==2002]

In [None]:
df_2002.head()

In [None]:
df[df["country"]=="Kenya"].info()

In [None]:
#extract data for multiple conditions
#df_mask=(df[df["country"]=="Kenya"]) & (df[df["year"]=="2002"])
#(df[df["year"]=="2002"])

In [None]:
#create a figure object
fig,axes=plt.subplots(figsize=(8,6))
x=df_2002["gdp_per_cap"]
y=df_2002['life_exp']
data=(x,y)
sns.scatterplot(data)
plt.xlabel("gdp_per_cap",color="blue")
plt.ylabel("life_exp",color="blue")
plt.title("Life expectancy as a function of GDP per capita for 2002 ",color="blue")
plt.legend()
plt.show()

In [None]:
#create a figure object
fig,axes=plt.subplots(figsize=(8,6))
x=df_2002["gdp_per_cap"]
y=df_2002['life_exp']
gdp=df_2002["gdp_per_cap"]
size=(gdp-gdp.min())/gdp.max()*300+20
plt.scatter(x,y,s=size,label="kenya")
plt.xlabel("gdp_per_cap",color="blue")
plt.ylabel("life_exp",color="blue")
plt.title("Life expectancy as a function of GDP per capita for 2002 ",color="skyblue")
plt.legend()
plt.show()

In [None]:
mask_kenya=df[df["country"]=="Kenya"]

In [None]:
x=mask_kenya["gdp_per_cap"]
y=mask_kenya["life_exp"]
for year in mask_kenya["year"]:
    mask_kenya[mask_kenya["year"]==year]
    plt.scatter(x,y,label=year)
    
sizes=mask_kenya["gdp_per_cap"]/100
#plt.scatter(x,y,s=sizes,label="kenya")
plt.xlabel("gdp_per_cap",color="blue")
plt.ylabel("life_exp",color="blue")
plt.title("Life expectancy as a function of GDP per capita for Kenya ",color="blue")
plt.legend()
plt.show()

In [None]:
#save the figure to file
fig.savefig("../data/gapminder_2002.png")

In [None]:
sns.relplot(
    data=df_2002,
    x="gdp_per_cap",
    y='life_exp',
    hue="continent",
    size="pop",
    sizes=(100,1000)
);


In [None]:
sns.relplot(
    data=mask_kenya,
    x="year",
    y='life_exp',
    hue="gdp_per_cap",
    size="pop",
    sizes=(100,1000)
);