In [220]:
import pandas as pd
import numpy as np
import sklearn

In [221]:
df = pd.read_csv("course_lead_scoring.csv")
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [222]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

# Manual Method

In [223]:
# df['lead_source'] = df['lead_source'].fillna('NA', inplace = True)
# df['industry'] = df['industry'].fillna('NA', inplace = True)
# df['employment_status'] = df['employment_status'].fillna('NA', inplace = True)
# df['location'] = df['location'].fillna('NA', inplace = True)

In [224]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

# Using Loop Method

In [225]:
# Select columns with 'object' or 'category' data type
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
categorical_cols

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')

In [226]:
for col in categorical_cols:
    # Check if the column actually has missing values
    if df[col].isnull().any():
        # Replace NaN with the string 'NA'
        df[col].fillna('NA', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('NA', inplace=True)


In [227]:
for col in categorical_cols:
    df[col] = df[col].astype(str)

In [228]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [229]:
# Select columns with 'float64', 'int64' or 'numerical' data type
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
numerical_cols

Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score', 'converted'],
      dtype='object')

In [230]:
for col in numerical_cols:
    # Check if the column actually has missing values
    if df[col].isnull().any():
        # Replace NaN with the string '0'
        df[col].fillna(0, inplace=True)
    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


In [231]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [232]:
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [233]:
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

# Setting up validation framework

In [234]:
from sklearn.model_selection import train_test_split

In [235]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [236]:
len(df_full_train), len(df_test)

(1169, 293)

In [237]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

In [238]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [239]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [240]:
# Select columns with 'float64', 'int64' or 'numerical' data type
# numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
# numerical_cols

In [241]:
# Create a correlation matrix for the numerical features
correlation_matrix = df[numerical_cols].corr()

In [242]:
# Define the numerical columns used for correlation
numerical_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
numerical_cols

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [243]:
# Define the specific pairs to check
pairs = {
    'interaction_count and lead_score': correlation_matrix.loc['interaction_count', 'lead_score'],
    'number_of_courses_viewed and lead_score': correlation_matrix.loc['number_of_courses_viewed', 'lead_score'],
    'number_of_courses_viewed and interaction_count': correlation_matrix.loc['number_of_courses_viewed', 'interaction_count'],
    'annual_income and interaction_count': correlation_matrix.loc['annual_income', 'interaction_count']
}

In [244]:
# Find the pair with the biggest absolute correlation
biggest_correlation_pair = max(pairs, key=lambda k: abs(pairs[k]))

In [245]:
# Print the results
print("Correlation Matrix:")
print(correlation_matrix.to_markdown(numalign="left", stralign="left"))
print("\nCorrelations for the specified pairs:")
print(pd.Series(pairs).to_markdown(numalign="left", stralign="left"))
print("\nThe pair with the biggest correlation (absolute value) is:", biggest_correlation_pair)

Correlation Matrix:
|                          | number_of_courses_viewed   | annual_income   | interaction_count   | lead_score   | converted   |
|:-------------------------|:---------------------------|:----------------|:--------------------|:-------------|:------------|
| number_of_courses_viewed | 1                          | 0.00977029      | -0.0235652          | -0.004879    | 0.435914    |
| annual_income            | 0.00977029                 | 1               | 0.0270365           | 0.0156095    | 0.0531314   |
| interaction_count        | -0.0235652                 | 0.0270365       | 1                   | 0.00988818   | 0.374573    |
| lead_score               | -0.004879                  | 0.0156095       | 0.00988818          | 1            | 0.193673    |
| converted                | 0.435914                   | 0.0531314       | 0.374573            | 0.193673     | 1           |

Correlations for the specified pairs:
|                                                | 0

In [246]:
from sklearn.metrics import mutual_info_score

In [247]:
# Select columns with 'objects', 'categorical' data type
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
categorical_cols

Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')

In [248]:
round(mutual_info_score(df_train.converted, df_train.lead_source),2)

0.04

In [249]:
round(mutual_info_score(df_train.converted, df_train.industry),2)

0.01

In [250]:
# define the function 'mutual_info_score'
def calculate_mutual_info_score (series):
    return round(mutual_info_score(series, df_train.converted),2)

In [251]:
# Apply function on the categorical columns
mi_score = df_train[categorical_cols].apply(calculate_mutual_info_score)
mi_score

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [252]:
mi_score1 = df_train[numerical_cols].apply(calculate_mutual_info_score)
mi_score1

  type_label = type_of_target(labels_true)


number_of_courses_viewed    0.11
annual_income               0.58
interaction_count           0.08
lead_score                  0.08
dtype: float64

# One hot encoding

In [253]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [254]:
from sklearn.feature_extraction import DictVectorizer

In [255]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [256]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1462 non-null   object 
 1   industry                  1462 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1462 non-null   float64
 4   employment_status         1462 non-null   object 
 5   location                  1462 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [257]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [258]:
categorical_cols = ['lead_source', 'industry', 'employment_status', 'location']
numerical_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [259]:
train_dicts = df_train[categorical_cols  + numerical_cols].to_dict(orient = 'records')

In [260]:
train_dicts[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'employment_status': 'student',
 'location': 'middle_east',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'interaction_count': 5,
 'lead_score': 0.03}

In [261]:
dv = DictVectorizer(sparse = False)

In [262]:
dv.fit(train_dicts)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [263]:
dv.transform(train_dicts)

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

# Inputing this into the model

In [264]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [265]:
X = df.drop("converted", axis=1)
y = df["converted"]

In [266]:
train_dicts = df_train[categorical_cols + numerical_cols].to_dict(orient = 'records')

In [267]:
dv = DictVectorizer(sparse = False)

In [268]:
X_train = dv.fit_transform(train_dicts)

In [269]:
val_dicts = df_val[categorical_cols + numerical_cols].to_dict(orient = 'records')

In [270]:
X_val = dv.fit_transform(val_dicts)

In [271]:
# Define the model pipeline
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [272]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [273]:
# Assuming X_train is a Pandas DataFrame

# 1. Apply One-Hot Encoding to convert all string/categorical columns into numerical
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val)

# 2. Fit the model with the encoded data
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [274]:
# Predict and evaluate
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {round(accuracy, 2)}")

Validation Accuracy: 0.74


# For Feature Elimination

In [275]:
# This value was calculated in the previous step (Question 4)
original_accuracy = 0.7372013651877133

In [276]:
# Define the model pipeline
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [277]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [278]:
# Assuming X_train is a Pandas DataFrame

# 1. Apply One-Hot Encoding to convert all string/categorical columns into numerical
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val)

# 2. Fit the model with the encoded data
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [279]:
# Prediction and Evaluation
y_pred = model.predict(X_val)
new_accuracy = accuracy_score(y_val, y_pred)

In [280]:
# Calculate difference: Original Accuracy - New Accuracy
difference = original_accuracy - new_accuracy

In [281]:
results[feature] = {'new_accuracy': new_accuracy, 'difference': difference, 'abs_difference': abs(difference)}

TypeError: list indices must be integers or slices, not str

In [292]:
# Print the final results
print("Original Accuracy:", original_accuracy)
print("\nResults for Feature Elimination:")
for feature, res in results.items():
    print(f"Feature excluded: '{feature}'")
    print(f"  New Accuracy: {res['new_accuracy']:.10f}")
    print(f"  Difference (Original - New): {res['difference']:.10f}")
    print(f"  Absolute Difference: {res['abs_difference']:.10f}")


Original Accuracy: 0.7372013651877133

Results for Feature Elimination:


AttributeError: 'list' object has no attribute 'items'

In [283]:
print(f"\nFeature with the smallest absolute difference: '{min_abs_diff_feature}'")


Feature with the smallest absolute difference: 'industry'


# Parameter Tuning

In [284]:
X = df.drop("converted", axis=1)
y = df["converted"]

In [285]:
t_dicts = df_train[categorical_cols + numerical_cols].to_dict(orient = 'records')

In [286]:
dv = DictVectorizer(sparse = False)

In [287]:
X_train = dv.fit_transform(t_dicts)

In [288]:
v_dicts = df_val[categorical_cols + numerical_cols].to_dict(orient = 'records')

In [289]:
X_val = dv.fit_transform(v_dicts)

In [290]:
# Define the model pipeline
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)