In [1]:
import pandas as pd
url="https://raw.githubusercontent.com/Gunnvant/corp_trainings/main/ola_electric/predictive_modelling/data/dm.csv"
dm = pd.read_csv(url)

In [2]:
dm.head(2)

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent,Cust_Id
0,Old,Female,Own,Single,Far,47500,0,High,6,755,247
1,Middle,Male,Rent,Single,Close,63600,0,High,6,1318,127


In [3]:
dm['AmountSpent'].mean()

1216.77

In [5]:
def create_target(x, threshold=None):
    """Flag whether a spend amount is at or above a cut-off.

    Parameters
    ----------
    x : number
        A single ``AmountSpent`` value.
    threshold : number, optional
        Cut-off to compare against. When omitted, the mean of
        ``dm['AmountSpent']`` is looked up at call time, which is the
        original behaviour. That lookup happens on every call, so pass a
        pre-computed threshold when applying this over a whole column.

    Returns
    -------
    int
        1 if ``x >= threshold``, otherwise 0.
    """
    if threshold is None:
        # Falls back to the notebook-level frame `dm`.
        threshold = dm['AmountSpent'].mean()
    return 1 if x >= threshold else 0

In [6]:
# Binary target: 1 when AmountSpent is at or above the column mean, else 0.
# The mean is computed once and the comparison runs on the whole column,
# instead of re-evaluating the mean for every row inside a per-element map.
dm['target'] = (dm['AmountSpent'] >= dm['AmountSpent'].mean()).astype('int64')

In [7]:
dm.head(2)

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent,Cust_Id,target
0,Old,Female,Own,Single,Far,47500,0,High,6,755,247,0
1,Middle,Male,Rent,Single,Close,63600,0,High,6,1318,127,1


In [8]:
def combine_age(x):
    """Collapse the age label into two groups.

    "Middle" and "Old" both become "Middle-Old"; every other value is
    returned as "Young".
    """
    return "Middle-Old" if x in ("Middle", "Old") else "Young"
# Two-level age feature derived from Age via combine_age.
dm['Age_N']=dm['Age'].map(combine_age)

In [9]:
def combine_kids(x):
    """Bucket a child count into a string label.

    Counts equal to 2 or 3 share the label "2-3"; any other value is
    returned as its own string representation.
    """
    return "2-3" if x in (2, 3) else str(x)
# String-labelled child-count feature ("2-3" or the count itself as text).
dm['Children_N'] = dm['Children'].map(combine_kids)

In [11]:
# Missing History values become their own level, 'NewCust', rather than
# being dropped.
# NOTE(review): the label suggests missing means "no purchase history yet" —
# confirm against the data dictionary.
dm['History'] = dm['History'].fillna('NewCust')

In [12]:
# Catalogs is stored as text from here on. Note that text ordering applies in
# later grouped output ('12', '18', '24', '6'), not numeric ordering.
dm['Catalogs'] = dm['Catalogs'].astype('str')

In [13]:
# 70% of the rows are drawn at random (fixed seed, so the split is
# reproducible) for training; the rows whose index labels were not drawn
# form the test set.
train = dm.sample(frac=0.70,random_state=2)
test = dm.drop(train.index)

In [14]:
### Model ###
train.head(2)

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent,Cust_Id,target,Age_N,Children_N
37,Middle,Male,Rent,Single,Close,47000,1,Medium,12,584,745,0,Middle-Old,1
726,Middle,Female,Rent,Single,Close,26200,1,Low,12,381,105,0,Middle-Old,1


In [15]:
import statsmodels.formula.api as smf

In [16]:
# Full model specification: target regressed on every candidate predictor.
# All predictors except Salary are wrapped in C(...) so they enter as
# categorical terms.
formula = '''target~C(Age_N)+C(Children_N)+C(Catalogs)+C(Married)+C(OwnHome)
+C(Gender)+C(History)+C(Location)+Salary
'''

In [17]:
import statsmodels.api as sm
# Fit the full specification on the training split as a binomial GLM
# (the printed summary reports a logit link).
# NOTE(review): the summary shows 24 IRLS iterations, and the History
# group-by further down shows no positive targets for History == 'Low' in
# train. This may indicate (quasi-)separation — check the coefficient and
# standard error for that level before interpreting the fit.
model1 = smf.glm(formula,data=train,family=sm.families.Binomial()).fit()

In [18]:
print(model1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 target   No. Observations:                  700
Model:                            GLM   Df Residuals:                      685
Model Family:                Binomial   Df Model:                           14
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -159.42
Date:                Tue, 28 Jun 2022   Deviance:                       318.84
Time:                        15:28:18   Pearson chi2:                     418.
No. Iterations:                    24   Pseudo R-squ. (CS):             0.5919
Covariance Type:            nonrobust                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                -4.38

In [19]:
train.head(2)

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent,Cust_Id,target,Age_N,Children_N
37,Middle,Male,Rent,Single,Close,47000,1,Medium,12,584,745,0,Middle-Old,1
726,Middle,Female,Rent,Single,Close,26200,1,Low,12,381,105,0,Middle-Old,1


In [20]:
def child_d(x):
    """Indicator for the "2-3" children bucket: 1 on a match, otherwise 0."""
    return 1 if x == "2-3" else 0
def hist_d(x):
    """Indicator for the "Medium" history level: 1 on a match, otherwise 0."""
    return 1 if x == "Medium" else 0
# Add the same two 0/1 indicator columns to both splits so the reduced model
# can be fitted on train and later scored on test.
for split_frame in (train, test):
    split_frame['Child_2_3'] = split_frame['Children_N'].map(child_d)
    split_frame['Hist_M'] = split_frame['History'].map(hist_d)

In [23]:
# Reduced specification: the Children_N and History factors are replaced by
# the single indicators Child_2_3 and Hist_M, and Age_N, Married, OwnHome and
# Gender are left out. Fitted on the training split with the same binomial
# family as the full model.
formula2 = '''target~Child_2_3+C(Catalogs)+Hist_M+C(Location)+Salary'''
model2 = smf.glm(formula2,data=train,family=sm.families.Binomial()).fit()

In [24]:
print(model2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 target   No. Observations:                  700
Model:                            GLM   Df Residuals:                      692
Model Family:                Binomial   Df Model:                            7
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -189.42
Date:                Tue, 28 Jun 2022   Deviance:                       378.84
Time:                        15:38:29   Pearson chi2:                     719.
No. Iterations:                     7   Pseudo R-squ. (CS):             0.5553
Covariance Type:            nonrobust                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             -5.9133      0

In [25]:
train['Catalogs'].unique()

array(['12', '18', '6', '24'], dtype=object)

In [30]:
train.groupby(['Catalogs']).agg({'target':'mean'})

Unnamed: 0_level_0,target
Catalogs,Unnamed: 1_level_1
12,0.313433
18,0.54386
24,0.660494
6,0.13253


In [33]:
train.groupby(['Location']).agg({'target':['mean','sum','count']})

Unnamed: 0_level_0,target,target,target
Unnamed: 0_level_1,mean,sum,count
Location,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Close,0.354508,173,488
Far,0.528302,112,212


In [34]:
train.groupby(['Children_N']).agg({'target':['mean','sum','count']})

Unnamed: 0_level_0,target,target,target
Unnamed: 0_level_1,mean,sum,count
Children_N,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.498489,165,331
1,0.406417,76,187
2-3,0.241758,44,182


In [35]:
train.groupby(['History']).agg({'target':['mean','sum','count']})

Unnamed: 0_level_0,target,target,target
Unnamed: 0_level_1,mean,sum,count
History,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
High,0.91573,163,178
Low,0.0,0,159
Medium,0.224359,35,156
NewCust,0.42029,87,207


In [36]:
# Target rate among training rows whose History is not "Medium" (the rows
# where Hist_M is 0). It is computed from `train` directly instead of
# hand-copying the High/Low/NewCust counts, so it stays correct if the data
# or the split changes.
train.loc[train['History'] != 'Medium', 'target'].mean()

0.45955882352941174

In [37]:
# Bin Salary into four equal-frequency intervals (quartiles); the cut points
# come from the training split only.
train['Salary_qtr']=pd.qcut(train['Salary'],4)

In [39]:
# Target rate, event count and row count per salary quartile.
# Salary_qtr is categorical, so `observed` is passed explicitly: this pins
# the behaviour (every interval is reported) and avoids depending on a
# default that newer pandas releases deprecate for categorical groupers.
train.groupby(['Salary_qtr'], observed=False).agg({'target':['mean','sum','count']}).reset_index()

Unnamed: 0_level_0,Salary_qtr,target,target,target
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,count
0,"(10099.999, 29975.0]",0.0,0,175
1,"(29975.0, 52850.0]",0.251429,44,175
2,"(52850.0, 77150.0]",0.56,98,175
3,"(77150.0, 168800.0]",0.817143,143,175


In [40]:
# Load a second dataset (order cancellations) and preview it.
# NOTE(review): `cancellation` is not referenced by any other cell visible in
# this section, and this assignment rebinds the `url` name that was first
# used to load `dm`.
url = "https://raw.githubusercontent.com/Gunnvant/corp_trainings/main/ola_electric/predictive_modelling/data/order_cancellation.csv"
cancellation = pd.read_csv(url)
cancellation.head(2)

Unnamed: 0.1,Unnamed: 0,destinationCountry,roomCount,udid,email,phone,guest_country_code,is_cardless,stars,type,NormalizedTotalBookingPrice,NormalizedLowRate,numberOfBookedNights,numberOfReviews,AvgUserRating,hotel_id,hasSpecialRequest,hasFreeCancellation,cancelledOrNoShow
0,1,0,1,0,0,0,0,1,4.5,HOTEL,104,97,3,220,4.058,0,0,0,1
1,2,1,1,1,1,1,0,0,3.0,HOTEL,107,99,8,47,2.5556,1,0,1,1


In [42]:
# Score the held-out rows with the reduced model. `test` must already carry
# the Child_2_3 and Hist_M columns that formula2 refers to.
# The previewed values lie between 0 and 1 — presumably predicted event
# probabilities; confirm against the statsmodels GLM predict documentation.
probs = model2.predict(test)
act_y = test['target']

In [43]:
# Pair each test row's actual target (y) with its model score (prob).
table = pd.DataFrame({'y':act_y,'prob':probs})

In [45]:
table.head(2)

Unnamed: 0,y,prob
0,0,0.231222
8,0,0.006477


In [47]:
# Split the scored rows into ten equal-frequency bins of predicted
# probability (deciles); each row is labelled with its bin interval.
table['prob_deciles']=pd.qcut(table['prob'],10)

In [48]:
table.head(2)

Unnamed: 0,y,prob,prob_deciles
0,0,0.231222,"(0.219, 0.544]"
8,0,0.006477,"(0.00398, 0.0192]"


In [53]:
# Per-decile number of events (sum of y) and number of rows, ordered from
# the highest-probability bin down to the lowest.
# `observed` is explicit because prob_deciles is categorical; sort_index is
# used because the deciles are the group index at this point.
gains = (
    table.groupby('prob_deciles', observed=False)['y']
    .agg(['sum', 'count'])
    .sort_index(ascending=False)
    .reset_index()
)

In [56]:
# Give the aggregate columns descriptive names: events per bin and rows per bin.
gains=gains.rename(columns={'sum':'#events','count':'no_obs'})

In [58]:
# Share of all test-set events that fall in each bin.
gains['perc_events']=gains['#events']/gains['#events'].sum()

In [60]:
# Running total of the event share. Because the rows are ordered from the
# highest-probability bin downwards, this is the cumulative gain obtained by
# targeting the top-scored bins first.
gains['cumulative_perc_events']=gains['perc_events'].cumsum()

In [61]:
gains

Unnamed: 0,prob_deciles,#events,no_obs,perc_events,cumulative_perc_events
0,"(0.982, 1.0]",30,30,0.263158,0.263158
1,"(0.921, 0.982]",25,30,0.219298,0.482456
2,"(0.828, 0.921]",24,30,0.210526,0.692982
3,"(0.544, 0.828]",18,30,0.157895,0.850877
4,"(0.219, 0.544]",11,30,0.096491,0.947368
5,"(0.0955, 0.219]",3,30,0.026316,0.973684
6,"(0.0363, 0.0955]",2,30,0.017544,0.991228
7,"(0.0192, 0.0363]",1,30,0.008772,1.0
8,"(0.00398, 0.0192]",0,30,0.0,1.0
9,"(-0.000847, 0.00398]",0,30,0.0,1.0


In [None]:
`