# Topic 3: Descriptive Statistics

In [1]:
#Import Required Packages

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the 'Final_Data' worksheet of the Topic 3 workbook into a DataFrame.
# NOTE(review): the path below is an absolute, machine-specific location, so
# the notebook only runs where this exact file exists.
data = pd.read_excel(
    r'C:\Users\DDEBAPRI\Desktop\Topic_3.xlsx',
    sheet_name='Final_Data',
)

In [3]:
# Preview the first five records of the raw data
data.head(n=5)

Unnamed: 0,model,income,age,education,gender,marital_status,states,date,amount
0,F-SERIES SUPER DUTY,5.0,,,M,M,OR,2020-02-04,
1,EXPLORER,,,4.0,F,M,FL,2020-03-19,
2,ESCAPE,3.0,110.0,0.0,M,S,NY,2020-03-11,500.0
3,FUSION,,108.0,5.0,M,M,FL,2020-03-23,286.0
4,F-SERIES,3.0,19.0,0.0,F,S,PA,2020-05-21,550.0


# Checking nulls and zeros and removing them

In [4]:
#Checking the non-null counts and variable types
# info() prints, for every column, the number of non-null entries and the dtype,
# which makes the columns with missing values easy to spot.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 886 entries, 0 to 885
Data columns (total 9 columns):
model             886 non-null object
income            553 non-null float64
age               680 non-null float64
education         699 non-null float64
gender            886 non-null object
marital_status    886 non-null object
states            886 non-null object
date              886 non-null datetime64[ns]
amount            884 non-null float64
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 62.4+ KB


In [None]:
# Number of missing values in each column, largest first.
# axis=0 collapses the row axis, so the result holds one total per column.
mis_val = (
    data.isna()
    .sum(axis=0)
    .sort_values(ascending=False)
    .rename('Missing Values')
)
mis_val

In [None]:
# Share of missing values per column, as a percentage rounded to 2 decimals
mis_val_percent = (
    data.isna()
    .sum()
    .mul(100)
    .div(len(data))
    .round(2)
    .rename('% of Missing Values')
)
mis_val_percent

In [None]:
#Counting zero values for all the columns

# Summing the boolean mask counts the True cells directly, so no integer cast
# is needed, and the Series is named once with the label used in the summary
# table.
zero_val = (data == 0).sum().rename('Zero Values')

In [None]:
# Percentage of cells equal to zero in each column, rounded to 2 decimals
zero_val_percent = (
    data.eq(0)
    .astype(int)
    .sum()
    .mul(100)
    .div(len(data))
    .round(2)
    .rename('% of Zero Values')
)

In [None]:
# Side-by-side summary of the zero and missing-value counts and percentages.
# concat aligns the four Series on their index, i.e. the column names of data.
table = pd.concat(
    [zero_val, zero_val_percent, mis_val, mis_val_percent],
    axis='columns',
)
table

In [None]:
#Replacing 0 with Null for variables where having a zero value does not make sense

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on data['age']: that form acts on an intermediate
# Series, so with pandas Copy-on-Write the zeros would remain in `data`.
data['age'] = data['age'].replace(0, np.nan)
data.info()

In [None]:
# Keep only the complete records: a row is removed as soon as it has one null
data1 = data.dropna(axis=0, how='any')
data1.info()

In [None]:
#Change the data types according to business definition

# One astype() call builds a new frame instead of assigning column by column
# into the result of dropna(), which pandas can flag as a chained assignment
# (SettingWithCopyWarning).
# income and education are stored as strings (treated as labels, not numbers);
# age and amount become integers. The integer casts rely on the rows with nulls
# having been dropped already, because NaN cannot be cast to int.
data1 = data1.astype({
    'income': str,
    'education': str,
    'age': int,
    'amount': int,
})

#Converting to string

# data2 is a second name for the same frame (not a copy); it is only used by
# the optional date round-trip that is commented out below.
data2 = data1

from datetime import datetime as dt

#data2['date'] = data2['date'].dt.strftime("%m-%d-%Y")
#data2[['date']]

# TODO: add keeping filter for dates


#Converting back to datetime
#data2['date'] = pd.to_datetime(data2['date'], format="%m-%d-%Y")
#data2[['date']]

data1.info()

In [None]:
# Give the cleaned frame a fresh 0-based index; the old row labels are discarded
data1 = data1.reset_index(drop=True)

# For row labels that start at 1 instead, uncomment:
#data1.index = data1.index + 1

# Preview the first five rows
data1.head(5)

# Checking basic Central Tendency measures

In [None]:
# Summary statistics for every column, shown with one row per variable.
# The 1st and 99th percentiles are requested on top of the quartiles, and
# include='all' makes describe() summarise the non-numeric columns as well.
perc = [.01, .25, .75, .99]

data1.describe(include='all', percentiles=perc).round(0).transpose()

In [None]:
# Frequency of each income category, drawn as a bar chart
income = data1['income'].value_counts().rename('income')
income.plot.bar()

In [None]:
# Most frequent age; mode() returns every value that ties for the highest count
mode = data1.loc[:, ['age']].mode()
mode

# To verify, inspect the raw frequencies:
#data1['age'].value_counts()

In [None]:
#Calling reqd. packages

from scipy.stats import norm

#Checking Skewness and Kurtosis

# Both measures are computed from the sample. pandas' kurt() reports excess
# kurtosis (Fisher's definition), so a normal distribution scores 0 on both
# skewness and kurtosis; values far from 0 indicate a departure from normality.
skew = data1[['age']].skew().rename('skewness')
kurt = data1[['age']].kurt().rename('kurtosis')
desc_table = pd.concat([skew,kurt], axis = 1)
desc_table

# Checking normality for numerical variables

#### Histogram

In [None]:
# pyplot provides the plotting functions used below
from matplotlib import pyplot

# Distribution of age as a histogram with matplotlib's default binning
pyplot.hist(data1.loc[:, 'age'])
pyplot.show()

#### Q-Q Plot

In [None]:
# qqplot draws the quantile-quantile chart; pyplot renders it
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot

# Sample quantiles of age against normal quantiles; line='s' adds the
# standardized reference line
qqplot(data1.loc[:, 'age'], line='s')
pyplot.show()

#### Box Plot

In [None]:
#Box Plot

# Only the cleaned frame is plotted here; the raw `data` frame is not needed
# for the chart and is left untouched.
data1['age'].plot(kind = 'box')

#### Kolmogorov Smirnov test

In [None]:
#K-S Test for Normality

from scipy.stats import kstest, norm

# kstest compares against the standard normal unless the mean and the std. dev.
# are passed in `args`, so both are taken from the sample itself instead of
# being typed in by hand (hand-typed values go stale when the data changes).
# NOTE: estimating the parameters from the same sample makes the p-value
# conservative, so treat the result as indicative.
age_mean = data1['age'].mean()
age_std = data1['age'].std()
stat, p = kstest(data1['age'], 'norm', args=(age_mean, age_std))
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpretation: H0 is "the sample follows the specified normal distribution"
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

# Checking Normality for a Non-Normal Distribution

In [None]:
#Calling reqd. packages
from scipy.stats import norm

#Checking Skewness and Kurtosis

# Sample skewness and excess kurtosis of amount. pandas' kurt() uses Fisher's
# definition, so a normal distribution would give 0 for both measures.
skew = data1[['amount']].skew().rename('skewness')
kurt = data1[['amount']].kurt().rename('kurtosis')
desc_table = pd.concat([skew,kurt], axis = 1)
desc_table

#### Histogram

In [None]:
# pyplot provides the plotting functions used below
from matplotlib import pyplot

# Distribution of amount as a histogram with matplotlib's default binning
pyplot.hist(data1.loc[:, 'amount'])
pyplot.show()

#### Q-Q Plot

In [None]:
# qqplot draws the quantile-quantile chart; pyplot renders it
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot

# Sample quantiles of amount against normal quantiles; line='s' adds the
# standardized reference line
qqplot(data1.loc[:, 'amount'], line='s')
pyplot.show()

#### Box Plot

In [None]:
# Box Plot

# The chart reads the cleaned frame only, so the raw `data` frame is not
# re-created here.
data1['amount'].plot(kind = 'box')

#### Kolmogorov Smirnov test

In [None]:
#K-S Test for Normality

from scipy.stats import kstest, norm

# The reference normal needs the mean and the std. dev. of the variable being
# tested; computing them from the sample keeps the test consistent with the
# data instead of relying on numbers copied in by hand.
# NOTE: because the parameters come from the same sample, the p-value is
# conservative, so treat the result as indicative.
amount_mean = data1['amount'].mean()
amount_std = data1['amount'].std()
stat, p = kstest(data1['amount'], 'norm', args=(amount_mean, amount_std))
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpretation: H0 is "the sample follows the specified normal distribution"
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')

# Thank You!