In [1]:

# To classify movies into different genres based on their attributes, we'll follow these steps:

# Data Preprocessing: Load the dataset, handle missing values, and preprocess categorical variables.
# Feature Engineering: Extract relevant features from the dataset.
# Split the dataset into training and testing sets.
# Build and Train Classification Models: We'll experiment with different algorithms such as Logistic Regression, Decision Trees, and Neural Networks.
# Evaluate Model Performance: Assess the accuracy of each model using appropriate evaluation metrics.
# Choose the best-performing model for genre classification.


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [3]:

# load Data
from google.colab import files
uploaded = files.upload()


Saving movies.csv to movies.csv


In [4]:
data = pd.read_csv("movies.csv")

In [5]:
# Drop rows with missing values
data.dropna(inplace=True)


In [6]:
data.head(3) # quick view of data

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0


In [7]:
# Encode categorical variables
label_encoders = {}
categorical_columns = ["genre", "director", "writer", "star", "country", "company"]
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])


In [8]:
# Split features and target variable
X = data.drop(["name", "rating", "released", "genre"], axis=1)
y = data["genre"]


In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Build and train classification models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")


Logistic Regression Accuracy: 0.25990783410138246
Decision Tree Accuracy: 0.3271889400921659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Build and train Logistic Regression model with increased max_iter
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f"Logistic Regression Accuracy: {accuracy_logistic}")

Logistic Regression Accuracy: 0.27373271889400924


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and train Logistic Regression model on scaled features
logistic_model_scaled = LogisticRegression(max_iter=1000)
logistic_model_scaled.fit(X_train_scaled, y_train)
y_pred_logistic_scaled = logistic_model_scaled.predict(X_test_scaled)
accuracy_logistic_scaled = accuracy_score(y_test, y_pred_logistic_scaled)
print(f"Scaled Logistic Regression Accuracy: {accuracy_logistic_scaled}")

Scaled Logistic Regression Accuracy: 0.4073732718894009


In [14]:
# Scaling the features using standard scaling has indeed improved the Logistic Regression model's accuracy significantly.
# This indicates that feature scaling helped in better convergence of the optimization algorithm and
# consequently improved the model's performance.

In [17]:
# Example Neural Network
model_nn = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500)
model_nn.fit(X_train, y_train)
y_pred_nn = model_nn.predict(X_test)
accuracy_nn = accuracy_score(y_test, y_pred_nn)
print(f"Neural Network Accuracy: {accuracy_nn}")

Neural Network Accuracy: 0.20921658986175115


In [19]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")

print(f"Scaled Logistic Regression Accuracy: {accuracy_logistic_scaled}")

Logistic Regression Accuracy: 0.25990783410138246
Decision Tree Accuracy: 0.3142857142857143
Scaled Logistic Regression Accuracy: 0.4073732718894009


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Scaled Logistic Regression Accuracy is the best performing

In [22]:
# Sample prediction
sample_index = 0  # Choose any index from the test set
sample_features = X_test_scaled[sample_index].reshape(1, -1)
sample_true_label = y_test.iloc[sample_index]
predicted_genre = logistic_model_scaled.predict(sample_features)[0]


In [23]:
# Print sample prediction
print(f"Sample True Genre: {sample_true_label}")
print(f"Predicted Genre: {predicted_genre}")


Sample True Genre: 4
Predicted Genre: 4


In [24]:
# Sample prediction resulted in the true genre being 4 and the predicted genre also being 4.
# This indicates that the model's prediction matched the true label for this particular sample.

In [25]:
# Save the model
# import joblib

# model_filename = "scaled_logistic_regression_model.pkl"
# joblib.dump(logistic_model_scaled, model_filename)
# print(f"Model saved as {model_filename}")