In [1]:
import pandas as pd
import numpy as np

insur = pd.read_csv('insurance_data_cleaned.csv', index_col='PatientID')
del insur['male']
insur.head()

Unnamed: 0_level_0,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,39,male,23.2,91,Yes,0,No,southeast,1121.87
2,24,male,30.1,87,No,0,No,southeast,1131.51
8,19,male,41.1,100,No,0,No,northwest,1146.8
9,20,male,43.0,86,No,0,No,northwest,1149.4
10,30,male,53.1,97,No,0,No,northwest,1163.46


In [2]:
pd.options.display.float_format = '{:.4f}'.format

# Keysic Descriptive Statistics

In [3]:
# Describe
insur.describe(percentiles = [0.01, 0.09])

Unnamed: 0,age,bmi,bloodpressure,children,claim
count,1332.0,1332.0,1332.0,1332.0,1332.0
mean,38.0863,30.6583,94.1892,1.0998,13325.2464
std,11.1128,6.119,11.4452,1.206,12109.6207
min,18.0,16.0,80.0,0.0,1121.87
1%,18.0,17.931,80.0,0.0,1518.7407
9%,23.0,22.6,82.0,0.0,2244.6912
50%,38.0,30.35,92.0,1.0,9412.965
max,60.0,53.1,140.0,5.0,63770.43


In [4]:
# Mode
insur.mode()

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,43.0,male,27.6,91.0,No,0.0,No,southeast,1149.4
1,,,28.9,,,,,,1639.56
2,,,33.3,,,,,,1704.57


In [5]:
# Correlations Matrix
insur.corr()

Unnamed: 0,age,bmi,bloodpressure,children,claim
age,1.0,-0.0422,-0.0603,-0.0262,-0.0286
bmi,-0.0422,1.0,0.145,0.0137,0.1999
bloodpressure,-0.0603,0.145,1.0,-0.0344,0.5313
children,-0.0262,0.0137,-0.0344,1.0,0.0641
claim,-0.0286,0.1999,0.5313,0.0641,1.0


In [6]:
#Variance
insur.var()

age                   123.4944
bmi                    37.4418
bloodpressure         130.9920
children                1.4543
claim           146642913.7950
dtype: float64

In [7]:
# Covariance Matrix
insur.cov()

Unnamed: 0,age,bmi,bloodpressure,children,claim
age,123.4944,-2.8663,-7.6722,-0.3512,-3853.6518
bmi,-2.8663,37.4418,10.1531,0.1012,14815.6062
bloodpressure,-7.6722,10.1531,130.992,-0.4742,73631.3596
children,-0.3512,0.1012,-0.4742,1.4543,936.8056
claim,-3853.6518,14815.6062,73631.3596,936.8056,146642913.795


In [8]:
#Skewness
insur.skew()

age             0.1117
bmi             0.2899
bloodpressure   1.4839
children        0.9326
claim           1.5138
dtype: float64

In [9]:
# Kurtosis
insur.kurt()

age             -0.9502
bmi             -0.0654
bloodpressure    2.8828
children         0.1935
claim            1.5955
dtype: float64

In [10]:
# Combine descriptions with more indexes
insur_des = insur.describe()

age_var = insur.age.var()
bmi_var= insur.bmi.var()
bloodpressure_var =insur.bloodpressure.var()
children_var = insur.children.var()
claim_var = insur.claim.var()

insur_des.loc['variance',:] = [age_var,bmi_var,bloodpressure_var,children_var,claim_var]
insur_des

Unnamed: 0,age,bmi,bloodpressure,children,claim
count,1332.0,1332.0,1332.0,1332.0,1332.0
mean,38.0863,30.6583,94.1892,1.0998,13325.2464
std,11.1128,6.119,11.4452,1.206,12109.6207
min,18.0,16.0,80.0,0.0,1121.87
25%,29.0,26.2,86.0,0.0,4760.1575
50%,38.0,30.35,92.0,1.0,9412.965
75%,47.0,34.725,99.0,2.0,16781.3275
max,60.0,53.1,140.0,5.0,63770.43
variance,123.4944,37.4418,130.992,1.4543,146642913.795


# Crosstab

In [11]:
pd.crosstab(insur['diabetic'], insur['region'], margins=True) 

region,northeast,northwest,southeast,southwest,All
diabetic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,110,190,225,170,695
Yes,121,155,217,144,637
All,231,345,442,314,1332


In [12]:
pd.crosstab(insur['diabetic'], insur['region'], normalize='index') 

region,northeast,northwest,southeast,southwest
diabetic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.1583,0.2734,0.3237,0.2446
Yes,0.19,0.2433,0.3407,0.2261


In [13]:
pd.crosstab(insur['diabetic'], insur['region'], normalize='columns') 

region,northeast,northwest,southeast,southwest
diabetic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.4762,0.5507,0.509,0.5414
Yes,0.5238,0.4493,0.491,0.4586


In [14]:
insur_ct = pd.crosstab(insur['diabetic'], insur['region'], margins=False)
insur_ct.div(insur_ct.sum(axis=1), axis=0) 

region,northeast,northwest,southeast,southwest
diabetic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.1583,0.2734,0.3237,0.2446
Yes,0.19,0.2433,0.3407,0.2261


In [15]:
# Transpose
insur_ct.transpose()

diabetic,No,Yes
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,110,121
northwest,190,155
southeast,225,217
southwest,170,144


# Additional Example: How to deal with two abnormal scenarios?

In [16]:
mv = pd.read_csv('movie.csv', index_col = 0)
mv.head()

Unnamed: 0_level_0,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,Tom Hardy,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,Doug Walker,...,,,,,,,12.0,7.1,,0


## Scenario 1: Treat numeric variables as qualitative features

In [17]:
# describe by bins
bins= [30,60,90,120, 150, 180, 210, 240, 270, 300, np.inf]
labels= ["30-60", "60-90", "90-120", "120-150", "150-180", "180-210", 
         "210-240", "240-270", "270-300","more than 300"]
mv_duration = pd.DataFrame(mv.groupby(pd.cut(mv.duration, bins= bins, labels= labels))['gross'].describe())
mv_duration

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
30-60,9.0,4200652.7778,6098646.1998,145109.0,333658.0,447093.0,5923044.0,18642318.0
60-90,604.0,31824598.5132,48234295.8709,721.0,1016190.75,13638910.5,42638165.0,422783777.0
90-120,2583.0,40938322.4971,54594611.5864,162.0,4869621.5,22160085.0,55169062.0,436471036.0
120-150,786.0,71935958.3142,83953526.8221,3330.0,15753450.75,43888720.5,99267056.25,652177271.0
150-180,120.0,121607058.875,142375106.2501,14873.0,26396229.75,73652997.5,165653041.75,760505847.0
180-210,31.0,140380018.9032,152043839.339,46495.0,29662404.0,96067179.0,218051260.0,658672302.0
210-240,14.0,64573134.6429,64534662.2971,6000000.0,14052501.25,40184996.5,100047396.5,198655278.0
240-270,2.0,31525000.0,37087750.6732,5300000.0,18412500.0,31525000.0,44637500.0,57750000.0
270-300,5.0,22865583.6,31653719.8391,454255.0,10769960.0,11433134.0,12870569.0,78800000.0
more than 300,3.0,2047233.6667,2226748.8466,145118.0,822559.0,1500000.0,2998291.5,4496583.0


## Scenario 2: When Categories do not catch features well

In [19]:
mv['genres']

color
Color    Action|Adventure|Fantasy|Sci-Fi
Color           Action|Adventure|Fantasy
Color          Action|Adventure|Thriller
Color                    Action|Thriller
NaN                          Documentary
                      ...               
Color                       Comedy|Drama
Color       Crime|Drama|Mystery|Thriller
Color              Drama|Horror|Thriller
Color               Comedy|Drama|Romance
Color                        Documentary
Name: genres, Length: 5043, dtype: object

In [20]:
mv.groupby('genres')['gross'].mean()

genres
Action                                                             59520906.6667
Action|Adventure                                                  124983127.8000
Action|Adventure|Animation|Comedy|Crime|Family|Fantasy             13596911.0000
Action|Adventure|Animation|Comedy|Drama|Family|Fantasy|Thriller              nan
Action|Adventure|Animation|Comedy|Drama|Family|Sci-Fi             122823508.0000
                                                                       ...      
Sci-Fi|Thriller                                                    29793790.0000
Thriller                                                             540969.6667
Thriller|War                                                                 nan
Thriller|Western                                                             nan
Western                                                            11993215.5000
Name: gross, Length: 914, dtype: float64

In [21]:
mv_1= mv.sort_values("movie_title", ascending= True)
mv_1["genres"]= mv_1["genres"].str.replace("|", " ")
mv_1["genres"]= mv_1["genres"].str.split(" ")
mv_1.genres


color
Color            [Drama, Horror, Mystery, Thriller]
Color    [Drama, Horror, Mystery, Sci-Fi, Thriller]
Color                                       [Drama]
Color                      [Comedy, Drama, Romance]
NaN                                        [Comedy]
                            ...                    
Color                                      [Horror]
Color                    [Horror, Sci-Fi, Thriller]
Color          [Action, Adventure, Crime, Thriller]
Color                 [Action, Adventure, Thriller]
Color                              [Action, Sci-Fi]
Name: genres, Length: 5043, dtype: object

In [22]:
genres_list=[]

for lst in mv_1.genres:
    for strg in lst:
        if strg not in genres_list:
            genres_list.append(strg)
genres_list

['Drama',
 'Horror',
 'Mystery',
 'Thriller',
 'Sci-Fi',
 'Comedy',
 'Romance',
 'Adventure',
 'Family',
 'Crime',
 'Action',
 'Biography',
 'History',
 'Fantasy',
 'War',
 'Musical',
 'Documentary',
 'Music',
 'Sport',
 'Western',
 'Animation',
 'News',
 'Film-Noir',
 'Short',
 'Reality-TV',
 'Game-Show']

In [23]:
genres_mean=[]

for i in genres_list:   
    mv_with_i = mv[mv['genres'].str.contains(i)]
    mean= mv_with_i["gross"].mean()
    genres_mean.append(mean)
genres_mean    

[34538926.88971963,
 33561593.6350365,
 44445346.2254902,
 46890571.625106744,
 82244105.7367387,
 47683480.197044335,
 41825587.07023061,
 98405187.78421701,
 89302305.1762208,
 37154454.46123522,
 76289888.80101523,
 34372729.36259542,
 35193409.08024691,
 85447070.0284091,
 38317141.83625731,
 53146878.75454546,
 12121445.065934066,
 41429922.39361702,
 41392510.26190476,
 39306668.059701495,
 106954239.10784313,
 14359793.0,
 7927.0,
 3926267.0,
 nan,
 nan]

In [24]:
data = {'genres': genres_list, 'gross_mean': genres_mean}
mv_1_mean_bygenres = pd.DataFrame(data).sort_values('gross_mean', ascending =False)
mv_1_mean_bygenres.reset_index(inplace=True)
del mv_1_mean_bygenres['index']
mv_1_mean_bygenres


Unnamed: 0,genres,gross_mean
0,Animation,106954239.1078
1,Adventure,98405187.7842
2,Family,89302305.1762
3,Fantasy,85447070.0284
4,Sci-Fi,82244105.7367
5,Action,76289888.801
6,Musical,53146878.7545
7,Comedy,47683480.197
8,Thriller,46890571.6251
9,Mystery,44445346.2255
