# Movie Ratings Analysis: Letterboxd Dataset

In [None]:
# Import required libraries
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the dataset
# The path is relative, so it resolves against the notebook's working
# directory. index_col=0 uses the CSV's first column as the row index.
file_path = '../datasets/Letterbox Movie Classification Dataset.csv'
df = pd.read_csv(file_path, index_col=0)
# Preview the first rows of the loaded frame
df.head()

## Data Cleaning
- Check for missing values
- Convert columns to appropriate types
- Parse genres from string to list

In [None]:
# Data cleaning functions
def parse_genres(genre_str):
    """Parse the textual form of a genre list into a Python list.

    Args:
        genre_str: Raw cell value, presumably the ``repr`` of a list of
            genre names such as ``"['Drama', 'Comedy']"``. A value that is
            already a list is returned unchanged, so applying the cleaning
            step twice does not discard previously parsed genres.

    Returns:
        list: The parsed genres. An empty list is returned for anything
        that is not a list/tuple/set literal (missing values, malformed
        text, scalars).
    """
    # Already parsed (e.g. the cleaning cell was re-run): keep the value.
    if isinstance(genre_str, list):
        return genre_str
    try:
        # literal_eval only accepts Python literals, so expressions embedded
        # in the CSV cannot execute -- unlike eval(), which would run them.
        parsed = ast.literal_eval(genre_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Best-effort parsing: unparseable input is treated as "no genres".
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set)):
        return list(parsed)
    # A lone scalar (number, string, dict, ...) is not a genre list.
    return []

def clean_dataframe(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    The copy has three columns replaced:

    * ``Genres`` -- each value passed through ``parse_genres``.
    * ``Runtime`` -- converted to numeric, unparseable values become NaN.
    * ``Average_rating`` -- converted to numeric, unparseable values become NaN.
    """
    # assign() builds a new frame, so no explicit copy() is needed.
    return df.assign(
        Genres=df['Genres'].apply(parse_genres),
        Runtime=pd.to_numeric(df['Runtime'], errors='coerce'),
        Average_rating=pd.to_numeric(df['Average_rating'], errors='coerce'),
    )

# Rebind df to the cleaned copy returned by clean_dataframe.
# NOTE(review): re-running this cell feeds already-cleaned data back through
# the cleaning step -- confirm that is harmless before relying on it.
df = clean_dataframe(df)
# Print the column summary (dtypes and non-null counts) to check the conversions
df.info()

## Grouping and Aggregation
- By genre: count, average rating
- By year: count, average rating
- By rating: distribution

In [None]:
# Group by genre: count and average rating
def genre_stats(df):
    """Summarise ratings per genre.

    A film contributes one observation to every genre in its ``Genres``
    value; films with an empty genre list contribute nothing.

    Args:
        df: DataFrame with a ``Genres`` column holding list-likes of genre
            names and an ``Average_rating`` column.

    Returns:
        DataFrame indexed by ``Genre`` with two-level columns
        ``('Average_rating', 'count')`` (number of non-missing ratings) and
        ``('Average_rating', 'mean')``, sorted by count in descending order.
        The result is empty, instead of raising ``KeyError``, when no film
        has any genre.
    """
    # One row per (film, genre) pair. explode() replaces the per-row Python
    # loop; empty lists become NaN genres, which groupby drops by default.
    # The index is reset first so duplicate labels cannot upset explode().
    per_genre = (
        df[['Genres', 'Average_rating']]
        .reset_index(drop=True)
        .explode('Genres')
        .rename(columns={'Genres': 'Genre'})
    )
    summary = per_genre.groupby('Genre').agg({'Average_rating': ['count', 'mean']})
    return summary.sort_values(('Average_rating', 'count'), ascending=False)

# Display the per-genre rating count and mean, most frequent genre first
genre_stats(df)

In [None]:
# Histogram (20 bins, KDE overlay) of the non-missing average ratings
ratings = df['Average_rating'].dropna()
ax = sns.histplot(ratings, bins=20, kde=True)
ax.set_title('Distribution of Average Ratings')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Count')
plt.show()

## Visualization by Genre

In [None]:
# Bar chart of the ten genres with the most rated films.
# genre_df stays a notebook-level name because the next cell reads it.
genre_df = genre_stats(df).reset_index()
most_common = genre_df.head(10)
ax = sns.barplot(data=most_common, x=('Average_rating', 'count'), y='Genre')
ax.set_title('Top 10 Genres by Number of Films')
ax.set_xlabel('Number of Films')
ax.set_ylabel('Genre')
plt.show()

In [None]:
# Bar chart of the ten genres with the highest mean rating
best_rated = genre_df.sort_values(('Average_rating', 'mean'), ascending=False).head(10)
ax = sns.barplot(data=best_rated, x=('Average_rating', 'mean'), y='Genre')
ax.set_title('Top 10 Genres by Average Rating')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Genre')
plt.show()

## Further Analysis
- Grouping by director, language, or studio
- Correlation between runtime and rating

In [None]:
# Scatter plot of runtime against average rating (semi-transparent points)
ax = sns.scatterplot(data=df, x='Runtime', y='Average_rating', alpha=0.5)
ax.set_title('Runtime vs. Average Rating')
ax.set_xlabel('Runtime (minutes)')
ax.set_ylabel('Average Rating')
plt.show()

## Export to Google Sheets
Export analysis results to Google Sheets using gspread and gspread_dataframe, authenticating with a Google service-account key file.

In [None]:
# Install gspread, gspread_dataframe, and oauth2client if not already installed
# !pip install gspread gspread_dataframe oauth2client

import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials

# Define the scope and authenticate
# The two scopes requested are the Sheets feed and Google Drive.
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive'
]

# NOTE(review): 'path/to/your/service_account.json' looks like a placeholder --
# point it at the real service-account key file before running.
# NOTE(review): oauth2client appears to be deprecated upstream; gspread's docs
# describe gspread.service_account(filename=...) -- consider migrating (confirm).
creds = ServiceAccountCredentials.from_json_keyfile_name('path/to/your/service_account.json', scope)
client = gspread.authorize(creds)

# Open the Google Sheet by URL
# NOTE(review): the spreadsheet URL is hard-coded; presumably the service
# account must have been granted edit access to it -- confirm.
sheet = client.open_by_url('https://docs.google.com/spreadsheets/d/14c7eIw_OK7T7QanFg2khj8uakrGPokMIvggoBMlFt-Q/edit?usp=sharing')
# sheet1 is the spreadsheet's first worksheet
worksheet = sheet.sheet1

# Example: Export the genre statistics DataFrame
# The statistics are recomputed here, so this cell does not depend on the
# plotting cells; the resulting frame has two-level column headers.
genre_df = genre_stats(df).reset_index()
# NOTE(review): presumably cells outside the written range keep their old
# contents -- clear the worksheet first if stale rows matter (confirm).
set_with_dataframe(worksheet, genre_df)

print('Data exported to Google Sheets successfully.')