<a href="https://colab.research.google.com/github/KennethLengo/MISTEST2026/blob/main/ML_BASICS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Load the housing dataset from Google Drive (Drive must already be mounted at /content/drive).
# NOTE(review): the folder name in the path is written as 'Colab Files ' with a trailing
# space before the slash -- confirm that this matches the actual Drive folder name.
data = pd.read_csv('/content/drive/MyDrive/Colab Files /housing_with_location.csv')
# Author's note: ChatGPT prompts and data source with 100+ rows
df = pd.DataFrame(data)
# Features and target
X = df[['square_footage', 'location']]
y = df['price']
# Preprocessing: one-hot encode the location column and pass square_footage through unchanged.
# verbose_feature_names_out=False keeps the output names short ('location_Downtown',
# 'square_footage') instead of prefixing them with the transformer name.
preprocessor = ColumnTransformer(
    transformers=[
        ('location', OneHotEncoder(sparse_output=False), ['location'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False)
# Create pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
# Split data (X_test / y_test are held out but not used further in this cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
# Train model
model.fit(X_train, y_train)
# Make prediction for a new house: 2000 sq ft in each location (Downtown, Suburb, or Rural)
new_houses = pd.DataFrame({'square_footage': [2000, 2000, 2000], 'location': ['Downtown', 'Suburb', 'Rural']})
predicted_prices = model.predict(new_houses)
for location, price in zip(new_houses['location'], predicted_prices):
  print(f"Predicted price for a 2000 sq ft house in {location}: ${price:,.2f}")
# Display model coefficients.
# The names come from the fitted preprocessor itself, so they are guaranteed to be in the
# same column order as the matrix the regressor was trained on (and therefore as coef_).
feature_names = model.named_steps['preprocessor'].get_feature_names_out().tolist()
coefficients = model.named_steps['regressor'].coef_
print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
  print(f"{feature}: {coef:.2f}")

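#Interpreting the coefficients: all three locations are one-hot encoded (no category is dropped), so there is no baseline location. The three location coefficients sum to about zero (1709.61 - 1670.88 - 38.72 = 0.01), so each one is measured relative to the average across locations, and only the differences between them are meaningful.
#Downtown has the strongest positive effect on price (coefficient 1709.61). Compared with a suburb house of the same size, a downtown house is predicted to cost 1709.61 - (-38.72) = $1,748.33 more, which matches the gap between the printed predictions ($465,920.63 vs. $464,172.31).
#Rural has the strongest negative effect on price (coefficient -1670.88). Compared with a suburb house of the same size, a rural house is predicted to cost -1670.88 - (-38.72) = $1,632.16 less.
#Suburb (coefficient -38.72) sits closest to the average location effect, but it is not a baseline category.
#Each extra square foot adds about $230.28 to the predicted price, regardless of location.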

Predicted price for a 2000 sq ft house in Downtown: $465,920.63
Predicted price for a 2000 sq ft house in Suburb: $464,172.31
Predicted price for a 2000 sq ft house in Rural: $462,540.15

Model Coefficients:
location_Downtown: 1709.61
location_Rural: -1670.88
location_Suburb: -38.72
square_footage: 230.28


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Hand-written sample of ten customers; 'churn' is the label to predict.
data = {
    'age': [25, 34, 45, 28, 52, 36, 41, 29, 47, 33],
    'monthly_usage_hours': [10, 50, 20, 15, 60, 30, 25, 12, 55, 40],
    'purchase_amount': [100, 250, 150, 80, 300, 200, 175, 90, 280, 220],
    'customer_service_calls': [5, 2, 8, 6, 1, 3, 7, 4, 0, 2],
    'region': ['North', 'South', 'West', 'East', 'South', 'North', 'West', 'East',
               'South', 'North'],
    'churn': [1, 0, 1, 1, 0, 0, 1, 1, 0, 0]  # 1 = churned, 0 = not churned
}
df = pd.DataFrame(data)
# Features and target
numeric_features = ['age', 'monthly_usage_hours', 'purchase_amount', 'customer_service_calls']
categorical_features = ['region']
X = df[numeric_features + categorical_features]
y = df['churn']
# Preprocessing: scale numerical features and one-hot encode categorical features.
# verbose_feature_names_out=False keeps the output names short ('age', 'region_West')
# instead of prefixing them with the transformer name.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(sparse_output=False), categorical_features)
    ],
    verbose_feature_names_out=False)
# Create pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])
# Split data (X_test / y_test are held out but not used further in this cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
# Train model
model.fit(X_train, y_train)
# Predict churn probability for a new customer
new_customer = pd.DataFrame({
    'age': [35],
    'monthly_usage_hours': [20],
    'purchase_amount': [150],
    'customer_service_calls': [5],
    'region': ['West']
})
churn_probability = model.predict_proba(new_customer)[0][1]  # Probability of churn (class 1)
# Classify based on threshold (0.5)
threshold = 0.5
churn_prediction = 1 if churn_probability > threshold else 0
print(f"Churn Probability for new customer: {churn_probability:.2f}")
print(f"Churn Prediction (1 = churn, 0 = no churn): {churn_prediction}")
# Display model coefficients.
# ColumnTransformer concatenates its outputs in the order the transformers are listed
# ('num' first, then 'cat'), so coef_ holds the four scaled numeric features followed by
# the region dummies. Taking the names from the fitted preprocessor keeps every label
# aligned with its coefficient; listing the region names first would mislabel all of them.
feature_names = model.named_steps['preprocessor'].get_feature_names_out().tolist()
coefficients = model.named_steps['classifier'].coef_[0]
print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
  print(f"{feature}: {coef:.2f}")

Churn Probability for new customer: 0.82
Churn Prediction (1 = churn, 0 = no churn): 1

Model Coefficients:
region_East: -0.13
region_North: -0.62
region_South: -0.67
region_West: 0.80
age: 0.17
monthly_usage_hours: -0.38
purchase_amount: -0.03
customer_service_calls: 0.24


In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Hand-written sample of ten customers used for the segmentation demo.
data = {
    'annual_spending': [500, 1200, 300, 1500, 800, 200, 1000, 600, 1300, 400],
    'purchase_frequency': [5, 12, 3, 15, 8, 2, 10, 6, 13, 4],
    'age': [25, 34, 45, 28, 52, 36, 41, 29, 47, 33],
    'region': ['North', 'South', 'West', 'East', 'South', 'North', 'West', 'East',
               'South', 'North'],
}
df = pd.DataFrame(data)

# Cluster on the numeric columns only ('region' stays in df but is not a clustering input),
# standardised so that no single column dominates the distance computation.
features = ['annual_spending', 'purchase_frequency', 'age']
X = df[features]
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Elbow method: fit K-Means for K = 1..5 and record the inertia of each fit.
K = range(1, 6)
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
    for k in K
]

# Write the elbow curve to disk; the figure is closed right after saving,
# so it is not rendered inline.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K, inertia, 'bo-')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal K')
fig.savefig('elbow_plot.png')
plt.close(fig)

# Final model: K is fixed by hand at 3 (chosen by reading the elbow plot).
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Per-cluster means of the clustering features, in original (unscaled) units.
cluster_means = df.groupby('cluster')[features].mean()
cluster_summary = cluster_means.round(2)
print("Cluster Characteristics:")
print(cluster_summary)

# Pick one marketing strategy per cluster: the spending rule is checked first,
# then purchase frequency, otherwise the cluster is treated as low-engagement.
for cluster in range(optimal_k):
  print(f"\nCluster {cluster} Strategy:")
  avg_spending = cluster_summary.loc[cluster, 'annual_spending']
  avg_frequency = cluster_summary.loc[cluster, 'purchase_frequency']
  if avg_spending > 1000:
    strategy = "High-spending customers: Offer exclusive promotions or loyalty rewards."
  elif avg_frequency > 10:
    strategy = "Frequent buyers: Provide bulk discounts or subscription plans."
  else:
    strategy = "Low-engagement customers: Send personalized re-engagement campaigns."
  print(strategy)

# Persist every customer row together with its cluster label.
df.to_csv('customer_segments.csv', index=False)

Cluster Characteristics:
         annual_spending  purchase_frequency   age
cluster                                           
0                 1500.0               15.00  28.0
1                  400.0                4.00  33.6
2                 1075.0               10.75  43.5

Cluster 0 Strategy:
High-spending customers: Offer exclusive promotions or loyalty rewards.

Cluster 1 Strategy:
Low-engagement customers: Send personalized re-engagement campaigns.

Cluster 2 Strategy:
High-spending customers: Offer exclusive promotions or loyalty rewards.
