In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [4]:
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [6]:
df.columns= df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for column in categorical_columns:
    df[column] = df[column].str.lower().str.replace(' ', '_')
    #df[column].fillna('NA', inplace=True)
    df.fillna({column:'NA'}, inplace=True)

df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [7]:
df.isna().sum()

lead_source                   0
industry                      0
number_of_courses_viewed      0
annual_income               181
employment_status             0
location                      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [8]:
df.annual_income.fillna(0, inplace=True)
df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.annual_income.fillna(0, inplace=True)


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train,test_size=0.25, random_state=42) #20% of the orignal df

In [10]:
df_train= df_train.reset_index(drop= True)
df_val= df_val.reset_index(drop= True)
df_test= df_test.reset_index(drop= True)

In [11]:
y_train=df_train.converted.values
y_val=df_val.converted.values
y_test=df_test.converted.values

In [12]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [13]:
df_full_train.converted.value_counts(normalize=True)

converted
1    0.607357
0    0.392643
Name: proportion, dtype: float64

In [14]:
df_full_train.industry.value_counts(normalize=True)

industry
finance          0.142857
retail           0.142002
other            0.132592
healthcare       0.128315
education        0.122327
manufacturing    0.119760
technology       0.115483
NA               0.096664
Name: proportion, dtype: float64

In [15]:
df_train.industry.value_counts(normalize=True)

industry
retail           0.142694
finance          0.141553
healthcare       0.133562
other            0.133562
manufacturing    0.124429
education        0.121005
technology       0.114155
NA               0.089041
Name: proportion, dtype: float64

In [16]:
numerical_columns = list(df_train.dtypes[df_train.dtypes != 'object'].index)
numerical_columns

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [17]:
df_full_train[numerical_columns].corrwith(df_full_train.converted)

number_of_courses_viewed    0.442068
annual_income               0.029612
interaction_count           0.378482
lead_score                  0.225641
dtype: float64

In [18]:
round(df[numerical_columns].corr(),2)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.01,-0.02,-0.0
annual_income,0.01,1.0,0.03,0.02
interaction_count,-0.02,0.03,1.0,0.01
lead_score,-0.0,0.02,0.01,1.0


In [19]:
from sklearn.metrics import mutual_info_score

In [20]:
mutual_info_score(y_train, df_train.industry)

0.011574521435657112

In [21]:
mutual_info_score(y_train, df_train.location)

0.004464157884038034

In [22]:
mutual_info_score(y_train, df_train.lead_source)

0.03539624379726594

In [23]:
mutual_info_score(y_train, df_train.employment_status)

0.012937677269442782

In [24]:
from sklearn.feature_extraction import DictVectorizer
dv= DictVectorizer(sparse=False)
train_dict= df_train[categorical_columns + numerical_columns].to_dict(orient= 'records')
X_train= dv.fit_transform(train_dict)

val_dict=df_val[categorical_columns + numerical_columns].to_dict(orient= 'records')
X_val= dv.fit_transform(val_dict)

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:

model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [27]:
model.coef_[0]

array([ 6.93289207e-07, -1.70391745e-01,  9.84123082e-01,  3.14505594e-01,
        2.12710939e-01, -1.33666101e+00, -3.27194435e-01,  1.14812792e+00,
       -5.05282264e-02, -2.56654843e-01,  6.44500951e-02, -1.06487577e-01,
       -4.68060117e-01,  6.33985425e-04,  1.08175460e+00,  3.01215137e+00,
        4.74039044e-01, -1.04179732e-01, -6.74403501e-02, -1.48154018e+00,
        1.63200232e+00, -4.48594095e-01,  2.99784880e-01, -3.29821005e-01,
       -1.77829265e-01, -3.55260858e-02,  2.62125948e-01,  2.43343892e-01,
       -1.70133517e-01, -8.76576888e-02,  1.46648427e+00])

In [28]:
model.predict(X_train)

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,

In [29]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [30]:
y_pred = model.predict_proba(X_val)[:,1]
churn_decision= (y_pred >=0.5)

In [31]:
(y_val == churn_decision).mean()

np.float64(0.6996587030716723)

In [32]:
y_pred = model.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_val)
accuracy

0.6996587030716723

Try feature elimination

In [33]:
from sklearn.feature_extraction import DictVectorizer
dv= DictVectorizer(sparse=False)

train_dict= df_train[categorical_columns + numerical_columns].to_dict(orient= 'records')
X_train= dv.fit_transform(train_dict)

val_dict=df_val[categorical_columns + numerical_columns].to_dict(orient= 'records')
X_val= dv.fit_transform(val_dict)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
from sklearn.metrics import accuracy_score
accuracy_new = accuracy_score(y_pred, y_val)
accuracy_new-accuracy

0.0

In [34]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Vektorozás
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Eredeti model teljesítménye
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
baseline_accuracy = accuracy_score(y_val, y_pred)

print(f"Baseline accuracy: {baseline_accuracy:.4f}")

# Csoportosítsuk a feature-öket eredeti változók szerint
feature_names = dv.get_feature_names_out()

# Például: 'job=admin' → 'job'
feature_map = defaultdict(list)
for i, f in enumerate(feature_names):
    var_name = f.split('=')[0] if '=' in f else f  # Pl. job=admin → job
    feature_map[var_name].append(i)

# Minden változóra: távolítsuk el a hozzá tartozó oszlopokat
results = []

for var_name, indices in feature_map.items():
    X_train_reduced = np.delete(X_train, indices, axis=1)
    X_val_reduced = np.delete(X_val, indices, axis=1)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_reduced, y_train)
    y_pred = model.predict(X_val_reduced)
    acc = accuracy_score(y_val, y_pred)
    diff = baseline_accuracy - acc

    results.append((var_name, acc, diff))

# Eredmények
df_results = pd.DataFrame(results, columns=['feature_group', 'accuracy', 'accuracy_drop'])
df_results_sorted = df_results.sort_values(by='accuracy_drop')

print("\nLeast useful variables (grouped by original feature):")
print(df_results_sorted.head(10))


Baseline accuracy: 0.6997

Least useful variables (grouped by original feature):
              feature_group  accuracy  accuracy_drop
0             annual_income  0.853242      -0.153584
6                  location  0.709898      -0.010239
4                lead_score  0.706485      -0.006826
5               lead_source  0.703072      -0.003413
2                  industry  0.699659       0.000000
1         employment_status  0.696246       0.003413
3         interaction_count  0.556314       0.143345
7  number_of_courses_viewed  0.556314       0.143345


Regularized logistic regression

In [35]:
from sklearn.linear_model import LogisticRegression

for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(
        penalty='l1',         # vagy 'l1'
        C=C,                # kisebb C -> erősebb regularizáció
        solver='liblinear',   # jó választás L1-hez is
        max_iter=1000,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    from sklearn.metrics import accuracy_score
    print("Accuracy:",C, round(accuracy_score(y_val, y_pred),3))


Accuracy: 0.01 0.652
Accuracy: 0.1 0.84
Accuracy: 1 0.863
Accuracy: 10 0.857
Accuracy: 100 0.853


In [36]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
for a in [0.01, 0.1, 1, 10, 100]:
    model = Ridge(alpha=a)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    print(f"alpha={a} → RMSE: {rmse:.4f}")


alpha=0.01 → RMSE: 0.3627
alpha=0.1 → RMSE: 0.3627
alpha=1 → RMSE: 0.3624
alpha=10 → RMSE: 0.3607
alpha=100 → RMSE: 0.3626
