# Machine Learning Fundamentals

## Learning Objectives
By the end of this lesson, you will be able to:
- Understand what machine learning is and when to use it
- Build your first prediction models
- Evaluate how well your models work
- Apply ML to real business problems

## Core Concepts
- **Machine Learning**: Teaching computers to find patterns and make predictions
- **Training Data**: Examples we use to teach the algorithm
- **Model**: What the algorithm produces after learning from the training data; it is what makes predictions
- **Features**: Input data (like age, income, location)
- **Target**: What we want to predict (like sales, prices)

## 1. Types of Machine Learning Problems

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Two main types of problems, each paired with a few illustrative examples.
# The leading "\n" on the second heading keeps a blank line between sections.
problem_types = (
    ("🎯 CLASSIFICATION: Predicting categories", (
        "Email → Spam or Not Spam",
        "Customer → Will Buy or Won't Buy",
        "Image → Cat, Dog, or Bird",
    )),
    ("\n📊 REGRESSION: Predicting numbers", (
        "House → Price ($250,000)",
        "Ad → Number of clicks (150)",
        "Stock → Tomorrow's price ($45.20)",
    )),
)

for heading, examples in problem_types:
    print(heading)
    print("Examples:")
    for example in examples:
        print(f"• {example}")

# Announce the hands-on sample data that follows
print("\n💡 Let's practice with real examples...")

# Classification example: customer purchase prediction.
# One tuple per customer: (age, income, previous_purchases, will_buy),
# where will_buy is 0=No, 1=Yes.
customer_rows = [
    (25, 50000, 0, 0),
    (35, 80000, 2, 1),
    (45, 100000, 5, 1),
    (30, 60000, 1, 0),
    (55, 120000, 8, 1),
    (40, 75000, 3, 1),
    (22, 45000, 0, 0),
    (50, 95000, 6, 1),
]
customers = pd.DataFrame(
    customer_rows,
    columns=['age', 'income', 'previous_purchases', 'will_buy'],
)

print("\nClassification Data (Predicting if customer will buy):")
print(customers)

# Regression example: house price prediction.
# One tuple per house: (size_sqft, bedrooms, age_years, price).
house_rows = [
    (1200, 2, 10, 250000),
    (1800, 3, 5, 350000),
    (2400, 4, 2, 480000),
    (1600, 3, 8, 320000),
    (3000, 5, 1, 650000),
    (2000, 3, 6, 400000),
    (1000, 2, 15, 200000),
    (2800, 4, 3, 580000),
]
houses = pd.DataFrame(
    house_rows,
    columns=['size_sqft', 'bedrooms', 'age_years', 'price'],
)

print("\nRegression Data (Predicting house price):")
print(houses)

## 2. Building Your First Models

In [None]:
# Build a Classification Model (Predict customer purchase)
print("🔍 CLASSIFICATION MODEL: Will customer buy?")

# Prepare data: feature columns (X) and the 0/1 target column (y)
X_class = customers[['age', 'income', 'previous_purchases']]
y_class = customers['will_buy']

# Split into training and testing (30% held out; fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X_class, y_class, test_size=0.3, random_state=42)

# Train model
clf_model = LogisticRegression()
clf_model.fit(X_train, y_train)

# Make predictions on the held-out rows and score them
predictions = clf_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Model accuracy: {accuracy:.1%}")
print(f"Out of 100 customers, we correctly predict {accuracy*100:.0f}")

# Test on new customer: 30 years old, $70k income, 2 previous purchases.
# The model was fitted on a DataFrame, so pass a DataFrame with the same
# column names; a bare list makes scikit-learn warn about missing feature names.
new_customer = pd.DataFrame([[30, 70000, 2]], columns=X_class.columns)
prediction = clf_model.predict(new_customer)[0]
probability = clf_model.predict_proba(new_customer)[0][1]  # P(will_buy == 1)

# Confidence is the probability of the class that was actually predicted,
# not always the probability of "Yes".
confidence = probability if prediction == 1 else 1 - probability

print(f"\nNew customer prediction:")
print(f"Will buy: {'Yes' if prediction == 1 else 'No'}")
print(f"Confidence: {confidence:.1%}")

# Build a Regression Model (Predict house price)
print("\n📊 REGRESSION MODEL: Predict house price")

# Prepare data: feature columns (X) and the numeric target column (y)
X_reg = houses[['size_sqft', 'bedrooms', 'age_years']]
y_reg = houses['price']

# Split and train (30% held out; fixed seed for repeatability)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)

# Make predictions on the held-out rows; MAE is the average absolute dollar error
price_predictions = reg_model.predict(X_test_reg)
mae = mean_absolute_error(y_test_reg, price_predictions)

print(f"Average prediction error: ${mae:,.0f}")

# Test on new house: 2000 sqft, 3 bedrooms, 5 years old.
# The model was fitted on a DataFrame, so pass a DataFrame with the same
# column names; a bare list makes scikit-learn warn about missing feature names.
new_house = pd.DataFrame([[2000, 3, 5]], columns=X_reg.columns)
predicted_price = reg_model.predict(new_house)[0]

print(f"\nNew house prediction:")
print(f"2000 sqft, 3BR, 5 years old → ${predicted_price:,.0f}")

# What the model learned: coef_ follows the column order of X_reg
print(f"\nModel insights:")
print(f"Each extra sqft adds: ${reg_model.coef_[0]:,.0f}")
print(f"Each extra bedroom adds: ${reg_model.coef_[1]:,.0f}")
# Negated so a negative age coefficient reads as a positive "reduction"
print(f"Each year older reduces: ${-reg_model.coef_[2]:,.0f}")

## 3. Model Evaluation and Improvement

In [None]:
# Compare different algorithms
print("🏆 COMPARING DIFFERENT MODELS")

# Try Random Forest (often performs better).
# RandomForestClassifier is already imported with the other sklearn imports
# at the top of the notebook, so no import is repeated here.
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

print(f"Logistic Regression accuracy: {accuracy:.1%}")
print(f"Random Forest accuracy: {rf_accuracy:.1%}")

# Report ties explicitly: equal accuracy must not be labelled "worse"
if rf_accuracy > accuracy:
    comparison = 'better'
elif rf_accuracy < accuracy:
    comparison = 'worse'
else:
    comparison = 'equally accurate'
print(f"Random Forest is {comparison}")

# Feature importance (what matters most?)
# Take the names from the training frame so labels always match the
# order of rf_model.feature_importances_.
feature_names = list(X_train.columns)
importance = rf_model.feature_importances_

print(f"\nWhat matters most for predictions:")
for name, imp in zip(feature_names, importance):
    print(f"{name}: {imp:.1%} importance")

# Visualize model performance as two side-by-side bar charts
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left panel: accuracy of each classifier, on a fixed 0-1 axis
models = ['Logistic\nRegression', 'Random\nForest']
accuracies = [accuracy, rf_accuracy]
ax1.bar(models, accuracies, color=['lightblue', 'lightgreen'])
ax1.set(ylabel='Accuracy', title='Model Comparison', ylim=(0, 1))

# Right panel: feature importances, with tilted x labels for readability
ax2.bar(feature_names, importance, color='orange')
ax2.set(ylabel='Importance', title='Feature Importance')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Real-world tips, printed as a check-marked list under one heading
print("\n💡 REAL-WORLD TIPS:")
for tip in (
    "More data usually = better models",
    "Clean data is more important than fancy algorithms",
    "Start simple, then try complex models",
    "Always test on unseen data",
    "Understand your business problem first",
):
    print(f"✅ {tip}")

# Practice Exercises

In [None]:
# Exercise 1: Employee salary prediction
employees = pd.DataFrame({
    'years_experience': [1, 3, 5, 2, 8, 6, 4, 10, 7, 9],
    'education_level': [1, 2, 3, 1, 3, 2, 2, 3, 3, 2],  # 1=High school, 2=Bachelor, 3=Master
    'performance_score': [3.2, 4.1, 4.8, 3.5, 4.9, 4.3, 3.8, 4.7, 4.5, 4.6],
    'salary': [45000, 55000, 75000, 48000, 95000, 70000, 60000, 110000, 85000, 100000]
})

print("Exercise 1: Predict Employee Salaries")
print("Build a model to predict salary based on experience, education, and performance")

# Your code here: Build and evaluate a regression model
X_emp = employees[['years_experience', 'education_level', 'performance_score']]
y_emp = employees['salary']

# Split, train, and evaluate (30% held out; fixed seed for repeatability)
X_train_emp, X_test_emp, y_train_emp, y_test_emp = train_test_split(X_emp, y_emp, test_size=0.3, random_state=42)
emp_model = LinearRegression()
emp_model.fit(X_train_emp, y_train_emp)
emp_pred = emp_model.predict(X_test_emp)
emp_mae = mean_absolute_error(y_test_emp, emp_pred)

print(f"Salary prediction error: ${emp_mae:,.0f}")

# Predict for new employee: 5 years experience, Bachelor's, 4.2 performance.
# The model was fitted on a DataFrame, so pass a DataFrame with the same
# column names; a bare list makes scikit-learn warn about missing feature names.
new_employee = pd.DataFrame([[5, 2, 4.2]], columns=X_emp.columns)
predicted_salary = emp_model.predict(new_employee)[0]
print(f"New employee predicted salary: ${predicted_salary:,.0f}")

# Exercise 2: Product purchase prediction
print(f"\nExercise 2: Product Purchase Prediction")
products = pd.DataFrame({
    'customer_age': [25, 35, 45, 30, 55, 40, 28, 50, 33, 42],
    'browse_time_min': [5, 15, 25, 8, 30, 20, 12, 35, 18, 22],
    'previous_orders': [0, 2, 8, 1, 12, 5, 3, 15, 4, 9],
    'cart_value': [0, 50, 150, 25, 200, 100, 75, 250, 80, 180],
    'purchased': [0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
})

# Build classification model
X_prod = products[['customer_age', 'browse_time_min', 'previous_orders', 'cart_value']]
y_prod = products['purchased']

# Hold out rows for evaluation: scoring on the rows the model was trained on
# overstates accuracy. stratify keeps both classes present in each split,
# which matters with only ten rows.
X_train_prod, X_test_prod, y_train_prod, y_test_prod = train_test_split(
    X_prod, y_prod, test_size=0.3, random_state=42, stratify=y_prod
)

prod_model = RandomForestClassifier(n_estimators=10, random_state=42)
prod_model.fit(X_train_prod, y_train_prod)
prod_accuracy = prod_model.score(X_test_prod, y_test_prod)  # accuracy on unseen rows

print(f"Purchase prediction accuracy: {prod_accuracy:.1%}")

# Feature importance: display labels listed in the same order as X_prod's columns
feature_names = ['age', 'browse_time', 'previous_orders', 'cart_value']
importance = prod_model.feature_importances_
print(f"Most important factor: {feature_names[np.argmax(importance)]}")

# Exercise 3: Compare algorithms
print(f"\nExercise 3: Algorithm Comparison")
print("Try different algorithms on the same dataset and compare:")

# Both entries are classifiers so they can be trained on the same data and
# scored with the same metric; a regressor's dollar MAE on a different
# dataset cannot be compared with a classifier's accuracy.
algorithms = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42)
}

# Use customer purchase data (the train/test split made for the
# classification model earlier in the notebook)
for name, algorithm in algorithms.items():
    algorithm.fit(X_train, y_train)
    pred = algorithm.predict(X_test)
    score = accuracy_score(y_test, pred)
    print(f"{name} (purchase prediction) - Accuracy: {score:.1%}")

# Exercise 4: Data insights
print("\nExercise 4: Business Insights")
print("What business decisions can you make from these models?")

# Salary model: one coefficient per feature, in the order the model was fitted on
print("From salary model:")
coefs = emp_model.coef_
experience_coef, education_coef, performance_coef = coefs[0], coefs[1], coefs[2]
print(f"• Each year of experience worth: ${experience_coef:,.0f}")
print(f"• Education level impact: ${education_coef:,.0f} per level")
print(f"• Performance impact: ${performance_coef:,.0f} per point")

# Purchase model: pair each display label with its importance share
print("\nFrom purchase model:")
purchase_importances = prod_model.feature_importances_
for feature, imp in zip(feature_names, purchase_importances):
    print(f"• {feature}: {imp:.1%} importance for purchases")

# Exercise 5: Model improvement suggestions, printed as a numbered list
print("\nExercise 5: Try These Improvements")
improvement_ideas = (
    "Add more features (like customer location, season)",
    "Try different algorithms (SVM, Neural Networks)",
    "Collect more data (larger sample size)",
    "Handle missing data better",
    "Use cross-validation for better evaluation",
)
for number, idea in enumerate(improvement_ideas, start=1):
    print(f"{number}. {idea}")

# Closing summary, printed as a check-marked list
print("\n🎯 Key Takeaways:")
key_takeaways = (
    "Start with simple models, then improve",
    "More data usually beats better algorithms",
    "Always validate on unseen data",
    "Understand what drives predictions",
    "Business impact matters more than perfect accuracy",
)
for takeaway in key_takeaways:
    print(f"✅ {takeaway}")