Import packages

In [1]:
# 引入套件
import pandas as pd
import numpy as np
import scipy.stats as stats
import re
import collections

Read data

In [2]:
# Load the training set, then build the feature frame by removing the
# identifier column and the `is_claim` column.
train_data = pd.read_csv("train.csv")
df = train_data.drop(columns=["policy_id", "is_claim"])
print(df.head())

   policy_tenure  age_of_car  age_of_policyholder area_cluster  \
0       0.515874        0.05             0.644231           C1   
1       0.672619        0.02             0.375000           C2   
2       0.841110        0.02             0.384615           C3   
3       0.900277        0.11             0.432692           C4   
4       0.596403        0.11             0.634615           C5   

   population_density  make segment model fuel_type     max_torque  ...  \
0                4990     1       A    M1       CNG   60Nm@3500rpm  ...   
1               27003     1       A    M1       CNG   60Nm@3500rpm  ...   
2                4076     1       A    M1       CNG   60Nm@3500rpm  ...   
3               21622     1      C1    M2    Petrol  113Nm@4400rpm  ...   
4               34738     2       A    M3    Petrol   91Nm@4250rpm  ...   

  is_rear_window_defogger is_brake_assist  is_power_door_locks  \
0                      No              No                   No   
1                   

Inspect missing values, shape, and column data types

In [3]:
# Sanity checks on the raw frame: total count of missing cells, its
# dimensions, and the dtype of every column.
missing_total = train_data.isna().sum().sum()
for summary in (missing_total, train_data.shape, train_data.dtypes):
    print(summary)

0
(58592, 44)
policy_id                            object
policy_tenure                       float64
age_of_car                          float64
age_of_policyholder                 float64
area_cluster                         object
population_density                    int64
make                                  int64
segment                              object
model                                object
fuel_type                            object
max_torque                           object
max_power                            object
engine_type                          object
airbags                               int64
is_esc                               object
is_adjustable_steering               object
is_tpms                              object
is_parking_sensors                   object
is_parking_camera                    object
rear_brakes_type                     object
displacement                          int64
cylinder                              int64
transmission_type 

Categorical variables

In [4]:
# Descriptive statistics (count / unique / top / freq) for the
# object-dtype columns, which are treated as categorical features.
cat_var = df.select_dtypes(include=["object"])
cat_var.describe()

Unnamed: 0,area_cluster,segment,model,fuel_type,max_torque,max_power,engine_type,is_esc,is_adjustable_steering,is_tpms,...,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert
count,58592,58592,58592,58592,58592,58592,58592,58592,58592,58592,...,58592,58592,58592,58592,58592,58592,58592,58592,58592,58592
unique,22,6,11,3,9,9,11,2,2,2,...,2,2,2,2,2,2,2,2,2,2
top,C8,B2,M1,Petrol,113Nm@4400rpm,88.50bhp@6000rpm,F8D Petrol Engine,No,Yes,No,...,No,No,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes
freq,13654,18314,14948,20532,17796,17796,14948,40191,35526,44574,...,41634,38077,32177,42435,42435,57383,34291,36309,42435,58229


Continuous variables

In [5]:
# Descriptive statistics for the int64 / float64 columns, which are
# treated as continuous features.
numeric_dtypes = ["int64", "float64"]
conti_var = df.select_dtypes(include=numeric_dtypes)
conti_var.describe()

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,population_density,make,airbags,displacement,cylinder,gear_box,turning_radius,length,width,height,gross_weight,ncap_rating
count,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0
mean,0.611246,0.069424,0.46942,18826.858667,1.763722,3.137066,1162.355851,3.626963,5.245443,4.852893,3850.476891,1672.233667,1553.33537,1385.276813,1.75995
std,0.414156,0.056721,0.122886,17660.174792,1.136988,1.832641,266.304786,0.483616,0.430353,0.228061,311.457119,112.089135,79.62227,212.423085,1.389576
min,0.002735,0.0,0.288462,290.0,1.0,1.0,796.0,3.0,5.0,4.5,3445.0,1475.0,1475.0,1051.0,0.0
25%,0.21025,0.02,0.365385,6112.0,1.0,2.0,796.0,3.0,5.0,4.6,3445.0,1515.0,1475.0,1185.0,0.0
50%,0.573792,0.06,0.451923,8794.0,1.0,2.0,1197.0,4.0,5.0,4.8,3845.0,1735.0,1530.0,1335.0,2.0
75%,1.039104,0.11,0.548077,27003.0,3.0,6.0,1493.0,4.0,5.0,5.0,3995.0,1755.0,1635.0,1510.0,3.0
max,1.396641,1.0,1.0,73430.0,5.0,6.0,1498.0,4.0,6.0,5.2,4300.0,1811.0,1825.0,1720.0,5.0


Cramér's V association between pairs of categorical features

In [6]:
# Association measure for two categorical features (Cramér's V)
def cramersV(f1, f2):
    """Return Cramér's V between two categorical features.

    The statistic is sqrt((chi2 / n) / min(r - 1, c - 1)), where chi2 is the
    Pearson chi-square statistic (no continuity correction) of the
    contingency table of ``f1`` against ``f2``, ``n`` is the number of
    observations in that table, and ``r`` / ``c`` are its row / column counts.

    Parameters
    ----------
    f1, f2 : pandas.Series
        The two features to cross-tabulate.

    Returns
    -------
    float
        Cramér's V, or NaN when the contingency table has fewer than two
        rows or fewer than two columns (the statistic is undefined there).
    """
    table = pd.crosstab(f1, f2)

    # Take the dimensions and the sample size from the table itself rather
    # than from the raw Series: the crosstab leaves out rows with a missing
    # value, so len(f1) / f1.unique() can disagree with the data the
    # chi-square statistic is actually computed on.
    min_dim = min(table.shape) - 1
    if min_dim < 1:
        # A feature with a single observed level (or an empty table) would
        # otherwise lead to a division by zero.
        return np.nan

    n_obs = table.to_numpy().sum()
    x2 = stats.chi2_contingency(table, correction=False)[0]
    return np.sqrt((x2 / n_obs) / min_dim)

# Cramér's V (rounded to 4 decimals) for every unordered pair of
# categorical features.
col = cat_var.columns
cat_comb1 = []
cat_comb2 = []
cat_cramer = []
for i, first in enumerate(col):
    for second in col[i + 1:]:
        cat_comb1.append(first)
        cat_comb2.append(second)
        cat_cramer.append(round(cramersV(cat_var[first], cat_var[second]), 4))

# Ranking of all pairs by association strength, strongest first.
# The frame is built in one step from the three parallel lists so that it
# also works when there are no pairs to report.
chisqstest = pd.DataFrame(
    {"colname1": cat_comb1, "colname2": cat_comb2, "cramer's V": cat_cramer}
)
chisqstest.sort_values(by="cramer's V", ascending=False)

Unnamed: 0,colname1,colname2,cramer's V
74,model,is_speed_alert,1.0000
156,engine_type,is_power_steering,1.0000
105,max_torque,rear_brakes_type,1.0000
103,max_torque,is_parking_sensors,1.0000
102,max_torque,is_tpms,1.0000
...,...,...,...
21,area_cluster,is_power_steering,0.0275
25,area_cluster,is_speed_alert,0.0222
230,is_parking_sensors,is_speed_alert,0.0162
243,is_parking_camera,is_day_night_rear_view_mirror,0.0142


Pearson correlation between pairs of continuous features

In [7]:
# Absolute Pearson correlation (rounded to 4 decimals) for every unordered
# pair of continuous features.
col = conti_var.columns
conti_comb1 = []
conti_comb2 = []
conti_rho = []
for i, first in enumerate(col):
    for second in col[i + 1:]:
        # corrcoef returns the 2x2 correlation matrix; [0, 1] is the
        # off-diagonal entry, i.e. the correlation between the two columns.
        rho = np.corrcoef(conti_var[first], conti_var[second])[0, 1]
        conti_comb1.append(first)
        conti_comb2.append(second)
        conti_rho.append(abs(round(rho, 4)))

# Ranking by correlation strength, keeping only the strongest TOP_N pairs.
TOP_N = 20
pearsoncorr = pd.DataFrame(
    {"colname1": conti_comb1, "colname2": conti_comb2, "abs(pearson)": conti_rho}
)
pearson_top = pearsoncorr.sort_values(by="abs(pearson)", ascending=False).head(TOP_N)
a = pearson_top  # original short name, kept in case other cells refer to it
pearson_top

Unnamed: 0,colname1,colname2,abs(pearson)
72,displacement,length,0.9617
90,turning_radius,length,0.9449
95,length,width,0.9159
73,displacement,width,0.8993
88,gear_box,gross_weight,0.8946
71,displacement,turning_radius,0.8754
69,displacement,cylinder,0.8662
80,cylinder,width,0.8624
84,gear_box,turning_radius,0.8617
97,length,gross_weight,0.8617
