In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

from hulearn.datasets import load_titanic
from hulearn.classification import FunctionClassifier

# Load the Titanic dataset shipped in hulearn.datasets as a DataFrame.
df = load_titanic(as_frame=True)
# Preview the first rows.
df.head()


Unnamed: 0,survived,pclass,name,sex,age,fare,sibsp,parch
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,1,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,1,0
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,0,0


In [3]:

# Separate the target column from the remaining feature columns.
y = df['survived']
X = df.drop(columns=['survived'])

def class_based(dataf, sex='male', pclass=1):
    """Flag rows whose ``sex`` and ``pclass`` columns both equal the given values.

    Parameters
    ----------
    dataf : frame-like with ``'sex'`` and ``'pclass'`` columns.
    sex : value compared against the ``'sex'`` column.
    pclass : value compared against the ``'pclass'`` column.

    Returns
    -------
    numpy.ndarray of ints: 1 where both comparisons hold, 0 otherwise.
    """
    sex_matches = dataf['sex'] == sex
    class_matches = dataf['pclass'] == pclass
    both_match = sex_matches & class_matches
    return np.array(both_match).astype(int)

# Wrap the rule function as an estimator. The pclass=10 passed here is only the
# initial keyword value; the grid below supplies the values that get scored.
mod = FunctionClassifier(class_based, pclass=10)
params = {
    'pclass': [1, 2, 3],
    'sex': ['male', 'female'],
}
# 3-fold cross-validated search over every (pclass, sex) combination.
grid = GridSearchCV(estimator=mod, param_grid=params, cv=3)
grid.fit(X, y)
pd.DataFrame(grid.cv_results_)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pclass,param_sex,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001779,0.000654,0.001137,0.000248,1,male,"{'pclass': 1, 'sex': 'male'}",0.52521,0.55042,0.617647,0.564426,0.039015,4
1,0.000914,0.000232,0.00067,0.000172,1,female,"{'pclass': 1, 'sex': 'female'}",0.701681,0.701681,0.710084,0.704482,0.003961,1
2,0.000566,1.4e-05,0.000481,1.9e-05,2,male,"{'pclass': 2, 'sex': 'male'}",0.495798,0.516807,0.478992,0.497199,0.01547,5
3,0.000602,2.4e-05,0.000485,6e-06,2,female,"{'pclass': 2, 'sex': 'female'}",0.668067,0.705882,0.668067,0.680672,0.017826,2
4,0.000591,1.8e-05,0.000495,2e-05,3,male,"{'pclass': 3, 'sex': 'male'}",0.378151,0.336134,0.323529,0.345938,0.023352,6
5,0.000553,1.4e-05,0.000526,7.1e-05,3,female,"{'pclass': 3, 'sex': 'female'}",0.617647,0.558824,0.571429,0.582633,0.025288,3


In [7]:
from hulearn.classification import FunctionClassifier
from hulearn.experimental import CaseWhenRuler

def make_prediction(dataf, age=15):
    """Predict labels for ``dataf`` with a ``CaseWhenRuler`` built from three rules.

    Rules are added in this order, each assigning label 1:
    ``V11 > 4``, ``V17 < -3``, ``V14 < -8``. The ruler is created with
    ``default=0`` (presumably the label for rows matching no rule — confirm
    against the CaseWhenRuler documentation).

    ``age`` is accepted but not used by the body.
    """
    def v11_above_4(d):
        return (d['V11'] > 4)

    def v17_below_minus_3(d):
        return (d['V17'] < -3)

    def v14_below_minus_8(d):
        return (d['V14'] < -8)

    ruler = CaseWhenRuler(default=0)
    # Same fluent chain as before; ``ruler`` itself is what gets queried below.
    ruler.add_rule(v11_above_4, 1).add_rule(v17_below_minus_3, 1).add_rule(v14_below_minus_8, 1)

    return ruler.predict(dataf)

# Wrap the rule-based function in a FunctionClassifier (no keyword overrides are passed).
clf = FunctionClassifier(make_prediction)



Rule Based Classification

In [9]:
import numpy as np 
import pandas as pd 
import os

# Read the persona sales data; the path is relative to the current working directory.
persona = pd.read_csv('input/persona.csv')
# Preview the first rows.
persona.head()


Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [11]:
# Exploratory summary of the persona data.
# Multi-line pandas results are printed with sep="\n" so the label sits on its
# own line and the Series/DataFrame header stays aligned with its rows.

# Question 1: How many unique SOURCE values are there? What are their frequencies?
print("Unique SOURCE", persona["SOURCE"].nunique())
print("Frequencies of SOURCE:", persona["SOURCE"].value_counts(), sep="\n")
# Question 2: How many unique PRICE values are there?
print("Unique PRICE", persona["PRICE"].nunique())
# Question 3: How many sales were made at each PRICE?
print("Frequencies of PRICE", persona["PRICE"].value_counts(), sep="\n")
# Question 4: How many sales were made in each country?
print("Frequencies of COUNTRY", persona["COUNTRY"].value_counts(), sep="\n")
# Question 5: How much was earned from sales by country?
print("Winnings by COUNTRY:", persona.groupby("COUNTRY")[["PRICE"]].aggregate("sum"), sep="\n")
# Question 6: What are the sales numbers by SOURCE type?
print("Amount of sales by SOURCE", persona["SOURCE"].value_counts(), sep="\n")
# Question 7: What are the PRICE averages by country?
# (Arguments are passed directly to print; wrapping them in an extra pair of
# parentheses would print a tuple repr instead.)
print("Mean PRICE by COUNTRY", persona.groupby("COUNTRY")[["PRICE"]].aggregate("mean"), sep="\n")
# Question 8: What are the PRICE averages by SOURCE?
print("Mean PRICE by SOURCE", persona.groupby("SOURCE")[["PRICE"]].aggregate("mean"), sep="\n")
# Question 9: What are the PRICE averages in the COUNTRY-SOURCE breakdown?
print(
    "Mean PRICE by COUNTRY and SOURCE",
    persona.groupby(["COUNTRY", "SOURCE"])[["PRICE"]].aggregate("mean").unstack(),
    sep="\n",
)
# First rows of the PRICE averages in the COUNTRY-SOURCE-SEX-AGE breakdown.
print(
    "Mean PRICE by COUNTRY, SOURCE, SEX and AGE",
    persona.groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])[["PRICE"]].aggregate("mean").head(),
    sep="\n",
)

Unique SOURCE 2
Frequencies of SOURCE: SOURCE
android    2974
ios        2026
Name: count, dtype: int64
Unique PRICE 6
Frequencies of PRICE PRICE
29    1305
39    1260
49    1031
19     992
59     212
9      200
Name: count, dtype: int64
Frequencies of COUNTRY COUNTRY
usa    2065
bra    1496
deu     455
tur     451
fra     303
can     230
Name: count, dtype: int64
Winnings by COUNTRY:          PRICE
COUNTRY       
bra      51354
can       7730
deu      15485
fra      10177
tur      15689
usa      70225
Amount of sales by SOURCE SOURCE
android    2974
ios        2026
Name: count, dtype: int64
('Mean PRICE by COUNTRY',              PRICE
COUNTRY           
bra      34.327540
can      33.608696
deu      34.032967
fra      33.587459
tur      34.787140
usa      34.007264)
('Mean PRICE by SOURCE',              PRICE
SOURCE            
android  34.174849
ios      34.069102)
('Mean PRICE by COUNTRY and SOURCE',              PRICE           
SOURCE     android        ios
COUNTRY                

In [12]:
# Mean PRICE for every (COUNTRY, SOURCE, SEX, AGE) combination, highest first,
# with the group keys turned back into ordinary columns.
agg_df = (
    persona
    .groupby(['COUNTRY', 'SOURCE', 'SEX', 'AGE'])[["PRICE"]]
    .aggregate("mean")
    .sort_values('PRICE', ascending=False)
    .reset_index()
)
agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE
0,bra,android,male,46,59.0
1,usa,android,male,36,59.0
2,fra,android,female,24,59.0
3,usa,ios,male,32,54.0
4,deu,android,female,36,49.0


In [13]:
# Age bucket edges and the label for each bucket. pd.cut closes intervals on
# the right by default, so age 18 lands in "0_18" and age 19 in "19_23".
# Ages outside (0, 70] get a missing AGE_CAT.
bins = [0, 18, 23, 30, 40, 70]
lab = ["0_18", "19_23", "24_30", "31_40", "41_70"]
agg_df["AGE_CAT"] = pd.cut(x=agg_df["AGE"], bins=bins, labels=lab)
agg_df.head()

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,AGE_CAT
0,bra,android,male,46,59.0,41_70
1,usa,android,male,36,59.0,31_40
2,fra,android,female,24,59.0,24_30
3,usa,ios,male,32,54.0,31_40
4,deu,android,female,36,49.0,31_40


In [14]:
# Build the persona key "<COUNTRY>_<SOURCE>_<SEX>_<AGE_CAT>" from the named
# columns with vectorised string concatenation, instead of positional indices
# into agg_df.values, so the key does not depend on column order.
# AGE_CAT comes from pd.cut (categorical), hence the cast to str; a missing
# AGE_CAT would therefore show up as the suffix "nan".
agg_df["customer_level_based"] = (
    agg_df["COUNTRY"] + "_"
    + agg_df["SOURCE"] + "_"
    + agg_df["SEX"] + "_"
    + agg_df["AGE_CAT"].astype(str)
)
# Keep only the persona key and its price from here on.
agg_df = agg_df[["customer_level_based", "PRICE"]]
agg_df.head()

Unnamed: 0,customer_level_based,PRICE
0,bra_android_male_41_70,59.0
1,usa_android_male_31_40,59.0
2,fra_android_female_24_30,59.0
3,usa_ios_male_31_40,54.0
4,deu_android_female_31_40,49.0


In [15]:
# Collapse rows that share a persona key into a single row with the mean PRICE.
# NOTE(review): PRICE was already averaged per (COUNTRY, SOURCE, SEX, AGE) in an
# earlier cell, so this is an unweighted mean of means — confirm that is intended.
agg_df = (
    agg_df
    .groupby("customer_level_based")
    .agg({"PRICE": "mean"})
    .reset_index()
)
agg_df.head()

Unnamed: 0,customer_level_based,PRICE
0,bra_android_female_0_18,35.645303
1,bra_android_female_19_23,34.07734
2,bra_android_female_24_30,33.863946
3,bra_android_female_31_40,34.898326
4,bra_android_female_41_70,36.737179


In [16]:
# Split personas into four PRICE quantile bins. Labels are listed in ascending
# bin order, so the lowest-priced quartile is 'D' and the highest is 'A'.
agg_df["SEGMENT"] = pd.qcut(x=agg_df['PRICE'], q=4, labels=['D', 'C', 'B', 'A'])
agg_df

Unnamed: 0,customer_level_based,PRICE,SEGMENT
0,bra_android_female_0_18,35.645303,B
1,bra_android_female_19_23,34.077340,C
2,bra_android_female_24_30,33.863946,C
3,bra_android_female_31_40,34.898326,B
4,bra_android_female_41_70,36.737179,A
...,...,...,...
104,usa_ios_male_0_18,33.983495,C
105,usa_ios_male_19_23,34.901872,B
106,usa_ios_male_24_30,34.838143,B
107,usa_ios_male_31_40,36.206324,A


In [17]:
# Look up the average PRICE and segment for a new customer's persona key.
new_user = 'tur_android_female_19_23'
agg_df.loc[agg_df['customer_level_based'] == new_user]
# Conclusion: a female Android user from Turkey aged 19-23 is expected to bring
# in about 33.44 on average and falls in segment C.


Unnamed: 0,customer_level_based,PRICE,SEGMENT
70,tur_android_female_19_23,33.444444,C
