**TASK 2: Predicting MovieLens Ratings from Movie Genres**

In [None]:
import os
import zipfile

# Where the uploaded archive lives and where it gets unpacked.
zip_path = '/content/ml-latest-small.zip'
extract_path = '/content/ml-latest-small/'

# Unpack every member of the archive under extract_path.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)

# Show the top-level entries that the extraction produced.
extracted_entries = os.listdir(extract_path)
print(extracted_entries)

['ml-latest-small']


In [None]:
import os

# Sanity check: list what the extraction directory contains.
extracted_entries = os.listdir('/content/ml-latest-small/')
print("Extracted files:", extracted_entries)

Extracted files: ['ml-latest-small']


In [None]:
import pandas as pd

# CSV locations; the files sit in a nested 'ml-latest-small' folder
# inside the extraction directory.
ratings_path = '/content/ml-latest-small/ml-latest-small/ratings.csv'
movies_path = '/content/ml-latest-small/ml-latest-small/movies.csv'

# Read both tables into DataFrames.
ratings_df, movies_df = pd.read_csv(ratings_path), pd.read_csv(movies_path)

# Preview the first rows of each table under its own heading.
for heading, frame in (("Ratings Data:", ratings_df), ("\nMovies Data:", movies_df)):
    print(heading)
    print(frame.head())

Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies Data:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [None]:
# Print column dtypes, non-null counts and memory usage for each table.
for frame in (ratings_df, movies_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


**Step 1: Data Preprocessing**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Attach each movie's title and genres to its ratings by joining on movieId
# (pandas' default join type is used).
data = ratings_df.merge(movies_df, on='movieId')

# Count missing values per column of the merged table.
null_counts = data.isnull().sum()
print(null_counts)

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64


**Step 2: Feature Engineering and Model Training**

In [None]:
# Split the data into features (X) and target (y)
X = data[['genres']]
y = data['rating']

# OneHotEncode the 'genres' column
preprocessor = ColumnTransformer(
    transformers=[
        ('genres', OneHotEncoder(), ['genres'])
    ])

# Create a pipeline with preprocessing and the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model_pipeline.fit(X_train, y_train)

**Step 3: Model Evaluation**

In [None]:
# OneHotEncode the 'genres' column, with handling for unknown categories
preprocessor = ColumnTransformer(
    transformers=[
        ('genres', OneHotEncoder(handle_unknown='ignore'), ['genres'])
    ])

# Create a pipeline with preprocessing and the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 1.0163541669228455
R-squared: 0.0760759645724206
