# Movie Analysis Project
This notebook covers the following sections:
1. Data Loading and Preprocessing
2. Exploratory Data Analysis (EDA)
3. Linear Regression Modeling and Evaluation
4. Conclusion
5. Tkinter GUI for Interactive Movie Categorization


In [None]:
import pandas as pd
import numpy as np

def load_and_preprocess(file_path):
    """Read the movie CSV and coerce the rating and vote columns to numbers.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain 'Rating(10)' and 'Votes'
        columns.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with 'Rating(10)' and 'Votes' converted to numeric
        values. Entries that cannot be parsed, or that are missing, become 0.

    Notes
    -----
    Replacing missing ratings/votes with 0 keeps every row, but those zeros
    are indistinguishable from real values in later statistics.
    """
    df = pd.read_csv(file_path)

    votes = df['Votes']
    # Thousands separators only exist when the column was read as text; a
    # column already parsed as numeric has no .str accessor at all.
    if not pd.api.types.is_numeric_dtype(votes):
        votes = votes.astype(str).str.replace(',', '', regex=False)

    # Assign the filled result back instead of calling fillna(inplace=True) on
    # df[col]: the in-place form operates on a temporary under Copy-on-Write
    # and would leave the frame unchanged.
    df['Rating(10)'] = pd.to_numeric(df['Rating(10)'], errors='coerce').fillna(0)
    df['Votes'] = pd.to_numeric(votes, errors='coerce').fillna(0)
    return df

# Dataset location — replace with the path to your own copy of the CSV.
file_path = 'indian movies.csv'
# Load and clean the data; `df` is reused by the EDA and modelling cells below.
df = load_and_preprocess(file_path)
# Last expression of the cell, so the first rows are shown as the cell output.
df.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def perform_eda(df):
    """Print summary statistics and plot the rating and vote distributions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Rating(10)' and 'Votes' columns (expected to be
        numeric, as produced by load_and_preprocess).

    Prints ``df.describe(include='all')``, shows one figure with two
    histograms (each with a KDE overlay), then prints the pairwise
    correlation matrix of the numeric columns. Returns None.
    """
    print(df.describe(include='all'))

    # Explicit axes instead of the pyplot state machine, so each plot is
    # bound to its own panel.
    fig, (rating_ax, votes_ax) = plt.subplots(1, 2, figsize=(12, 5))
    sns.histplot(df['Rating(10)'], kde=True, ax=rating_ax)
    rating_ax.set_title('Distribution of Ratings')
    sns.histplot(df['Votes'], kde=True, bins=30, ax=votes_ax)
    votes_ax.set_title('Distribution of Votes')
    fig.tight_layout()
    plt.show()

    # Restrict to numeric columns: DataFrame.corr() raises on text columns in
    # pandas >= 2.0, where numeric_only defaults to False.
    print(df.select_dtypes(include='number').corr())

# Run the exploratory analysis on the preprocessed frame.
perform_eda(df)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train_and_evaluate_regression_model(df, test_size=0.3, random_state=42):
    """Fit a linear regression of rating on vote count and print the test RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Votes' column (the single predictor) and a
        'Rating(10)' column (the target).
    test_size : float or int, optional
        Passed to ``train_test_split``; defaults to 0.3, the value this
        function previously hard-coded.
    random_state : int or None, optional
        Seed passed to ``train_test_split`` so the split is repeatable;
        defaults to 42, the value this function previously hard-coded.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The model fitted on the training portion of the data.
    """
    features = df[['Votes']]  # double brackets keep a 2-D frame for sklearn
    target = df['Rating(10)']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows only.
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f'RMSE: {rmse}')
    return model

# Fit the regression on the preprocessed frame and keep the fitted model.
model = train_and_evaluate_regression_model(df)


# Conclusion
(Write your conclusion here)


In [None]:
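# Tkinter GUI (uncomment to run). show_category() reads df['Category'], which this
# notebook only assigns in its final cell, so run that cell before enabling the GUI.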
# import tkinter as tk
# from tkinter import ttk

# def show_category(movies_df):
#     def display_category():
#         selected_movie = movie_var.get()
#         category = movies_df[movies_df['Movie Name'] == selected_movie]['Category'].iloc[0]
#         result_label.config(text=f'Category: {category}')

#     root = tk.Tk()
#     root.title('Movie Hit or Flop Predictor')

#     movie_var = tk.StringVar()
#     movie_dropdown = ttk.Combobox(root, textvariable=movie_var, values=movies_df['Movie Name'].unique())
#     movie_dropdown.grid(column=0, row=0, padx=10, pady=10)

#     show_button = tk.Button(root, text="Show Category", command=display_category)
#     show_button.grid(column=1, row=0, padx=10, pady=10)

#     result_label = tk.Label(root, text="")
#     result_label.grid(column=0, row=1, columnspan=2)

#     root.mainloop()

# show_category(df)


In [None]:

    votes_thresholds = {'flop': votes_quantiles[0.25], 'average_hit': votes_quantiles[0.75]}
    df['Category'] = df.apply(categorize_movie, args=(rating_thresholds, votes_thresholds), axis=1)

    perform_eda(df)
    model = train_and_evaluate_regression_model(df)

    # Uncomment the next line to run the GUI
    # show_category(df)

# Conclusion of findings (write your conclusion here)

# Script-style entry point: calls main() when this code runs as the top-level module.
# NOTE(review): no `def main` is visible in this notebook; the indented fragment
# above looks like the tail of its body — confirm the full definition exists
# before running this cell.
if __name__ == "__main__":
    main()
