In [1]:
import pandas as pd

# Read the lead-scoring CSV into a DataFrame
df = pd.read_csv('course_lead_scoring.csv')

# Preview the top five rows as plain text
preview = df.head(n=5)
print(preview)


    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  


In [3]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)


lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


### Question 1
**What is the most frequent observation (mode) for the column industry?**

In [9]:
# Most frequent value of 'industry': take the first entry of the mode() result
industry_modes = df['industry'].mode()
mode_industry = industry_modes.iloc[0]

print("Most frequent industry:", mode_industry)

Most frequent industry: retail


### Question 2

In [11]:
# Impute missing values: 'NA' for text columns, 0.0 for numeric columns
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include='number').columns

df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

# Pairwise correlations between all numeric columns
corr_matrix = df[numerical_cols].corr()

# Report only the four pairs the question asks about
pairs_to_report = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]
for first, second in pairs_to_report:
    print(f"{first} and {second}:", corr_matrix.loc[first, second])


interaction_count and lead_score: 0.009888182496913131
number_of_courses_viewed and lead_score: -0.004878998354681276
number_of_courses_viewed and interaction_count: -0.023565222882888037
annual_income and interaction_count: 0.02703647240481443


In [14]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'converted'
X = df.drop(columns='converted')
y = df['converted']

# Stage 1: keep 60% for training, set 40% aside
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Stage 2: halve the 40% into validation and test (20% each of the total)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report how many rows landed in each partition
for label, part in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{label} size:", len(part))


Train size: 877
Validation size: 292
Test size: 293


**Biggest correlation:** `annual_income` and `interaction_count` (≈ 0.027)

### Question 3

**Calculate the mutual information score between converted and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?**



In [20]:
from sklearn.feature_selection import mutual_info_classif

# Only using training data
categorical = ['industry', 'location', 'lead_source', 'employment_status']

# Integer-encode the categorical columns on a COPY of the training features.
# X_train itself is deliberately left untouched: writing the converted columns
# back into it would change shared notebook state as a side effect of computing
# a score, and assigning into a frame derived from another frame is the pattern
# that can raise pandas' SettingWithCopyWarning.
X_train_encoded = X_train.copy()
for col in categorical:
    X_train_encoded[col] = X_train_encoded[col].astype('category').cat.codes

# Mutual information between each encoded column and the target;
# discrete_features=True tells scikit-learn to treat the codes as categories
mi_scores = mutual_info_classif(X_train_encoded[categorical], y_train, discrete_features=True)

# Round scores to 2 decimals, keyed by column name
mi_scores_rounded = {col: round(score, 2) for col, score in zip(categorical, mi_scores)}

# Print results
print("Mutual Information Scores:")
for col, score in mi_scores_rounded.items():
    print(f"{col}: {score}")


Mutual Information Scores:
industry: 0.02
location: 0.0
lead_source: 0.03
employment_status: 0.02


**The variable with the biggest mutual information score is:** `lead_source` (0.03)

### Question 4

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Start again from the raw CSV
df = pd.read_csv('course_lead_scoring.csv')

# Impute: 'NA' for text columns, 0.0 for numeric columns
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include='number').columns
df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

# Target vs. features
y = df['converted']
X = df.drop(columns='converted')

# 60% train / 20% validation / 20% test, done as two consecutive splits
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# One-hot encode each partition, then give the validation frame exactly the
# training frame's columns (columns absent from validation are filled with 0)
X_train_encoded, X_val_encoded = pd.get_dummies(X_train).align(
    pd.get_dummies(X_val), join='left', axis=1, fill_value=0
)

# Fit the classifier on the training partition
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Score it on the validation partition
y_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_pred)

print("Validation Accuracy:", round(accuracy, 2))



Validation Accuracy: 0.74


### Question 5

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def _fit_and_score(train_features, train_target, val_features, val_target):
    """Fit the logistic regression used in this study and return its validation accuracy.

    Parameters
    ----------
    train_features, train_target
        Encoded feature frame and target used to fit the model.
    val_features, val_target
        Encoded feature frame and target the fitted model is scored on.

    Returns
    -------
    The accuracy of the fitted model on the validation data.
    """
    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(train_features, train_target)
    return accuracy_score(val_target, clf.predict(val_features))


# Load and prepare data
df = pd.read_csv('course_lead_scoring.csv')

# Fill missing values: 'NA' for text columns, 0.0 for numeric columns
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include='number').columns

df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

# Train/val/test split (60/20/20 via two consecutive splits)
X = df.drop('converted', axis=1)
y = df['converted']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# One-hot encode all object columns
X_train_encoded = pd.get_dummies(X_train)
X_val_encoded = pd.get_dummies(X_val)

# Align train and val so both frames carry exactly the training columns
X_train_encoded, X_val_encoded = X_train_encoded.align(X_val_encoded, join='left', axis=1, fill_value=0)

# Baseline: model trained on every feature
original_accuracy = _fit_and_score(X_train_encoded, y_train, X_val_encoded, y_val)

print("Original accuracy:", original_accuracy)

# Now test removing features one at a time
features_to_test = ['industry', 'employment_status', 'lead_score']
diffs = {}

for feature in features_to_test:
    # A numeric feature keeps its own name; a one-hot encoded categorical
    # becomes '<feature>_<value>' columns. Match exactly those two shapes
    # rather than any column that merely shares the prefix.
    matching_cols = [
        col for col in X_train_encoded.columns
        if col == feature or col.startswith(f"{feature}_")
    ]
    if not matching_cols:
        # Without this check a misspelt feature would silently drop nothing
        # and be reported as having a difference of 0.
        raise ValueError(f"No encoded columns found for feature '{feature}'")

    # Drop those columns from both partitions
    X_train_subset = X_train_encoded.drop(columns=matching_cols)
    X_val_subset = X_val_encoded.drop(columns=matching_cols)

    # Train and evaluate model without the feature
    accuracy = _fit_and_score(X_train_subset, y_train, X_val_subset, y_val)

    # Positive difference: accuracy fell without the feature;
    # negative difference: accuracy rose without it
    diff = original_accuracy - accuracy
    diffs[feature] = diff
    print(f"Accuracy without '{feature}': {accuracy:.4f} | Difference: {diff:.4f}")

# Identify feature with smallest (signed) difference
least_useful = min(diffs, key=diffs.get)
print(f"\n Feature with the smallest accuracy drop (least useful): {least_useful}")


Original accuracy: 0.7431506849315068
Accuracy without 'industry': 0.7432 | Difference: 0.0000
Accuracy without 'employment_status': 0.7466 | Difference: -0.0034
Accuracy without 'lead_score': 0.7432 | Difference: 0.0000

 Feature with the smallest accuracy drop (least useful): employment_status


### Question 6

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Candidate values of the regularisation parameter C, in ascending order
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

# Fit one model per candidate and record its rounded validation accuracy
for c in C_values:
    model = LogisticRegression(
        solver='liblinear', C=c, max_iter=1000, random_state=42
    ).fit(X_train_encoded, y_train)
    y_pred = model.predict(X_val_encoded)
    acc = accuracy_score(y_val, y_pred)
    acc_rounded = round(acc, 3)
    results[c] = acc_rounded
    print(f"C = {c}: Validation Accuracy = {acc_rounded}")

# Pick the entry with the highest rounded accuracy; among equal scores the
# earliest-inserted candidate (here the smallest C) is the one returned
best_C, best_acc = max(results.items(), key=lambda item: item[1])
print(f"\n✅ Best C value: {best_C} with accuracy {best_acc}")


C = 0.01: Validation Accuracy = 0.743
C = 0.1: Validation Accuracy = 0.743
C = 1: Validation Accuracy = 0.743
C = 10: Validation Accuracy = 0.743
C = 100: Validation Accuracy = 0.743

✅ Best C value: 0.01 with accuracy 0.743
