# Week2:
# a, Movie Genre Prediction

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv("movie.csv")

# Encode categorical features
label_encoder = LabelEncoder()
df['language'] = label_encoder.fit_transform(df['language'])
df['genre'] = label_encoder.fit_transform(df['genre'])
df['director'] = label_encoder.fit_transform(df['director'])

# Remove rare classes with fewer than 2 samples
class_counts = df['genre'].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df['genre'].isin(rare_classes)]

# Features and target
X = df[['duration', 'language', 'average_rating', 'number_of_reviews', 'year', 'budget', 'revenue']]
y = df['genre']

# Check class distribution
print("Class distribution in the target variable:")
print(df['genre'].value_counts())

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data with stratification
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# Train a classifier with class weights to handle imbalance
clf = RandomForestClassifier(random_state=42, class_weight="balanced")
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluate using classification report with zero_division parameter
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Class distribution in the target variable:
1    3
0    2
Name: genre, dtype: int64
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



# b, Sports Performance Analysis

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv("sports.csv")

# Features and target
X = df[['accuracy', 'speed', 'stamina', 'age']].copy()  # Ensure X is a copy
y = df['performance'].copy()  # Ensure y is a copy

# Include outliers
outlier = pd.DataFrame([[200, 15, 150, 30]], columns=X.columns)
X = pd.concat([X, outlier], ignore_index=True)

# Add a corresponding target value for the outlier using pd.concat
y = pd.concat([y, pd.Series(['excellent'])], ignore_index=True)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train k-NN model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predictions
y_pred = knn.predict(X_test)

# Evaluate with zero_division set to 1 to avoid warnings
print(classification_report(y_test, y_pred, zero_division=1))


              precision    recall  f1-score   support

     Average       0.00      1.00      0.00       0.0
   Excellent       1.00      0.00      0.00       1.0
        Good       1.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.67      0.33      0.00       2.0
weighted avg       1.00      0.00      0.00       2.0

