# Data description

In [None]:
# import glob
import numpy as np
import os
import pandas as pd
import re
import requests
import tarfile

from collections import Counter

import matplotlib.pyplot as plt

In [None]:
# Load the pre-processed train and test splits from the CSV files
# under data/processed/.
train = pd.read_csv(filepath_or_buffer="data/processed/train.csv")
test = pd.read_csv(filepath_or_buffer="data/processed/test.csv")

## Unique movies

In [None]:
# Collect the distinct `id` values of each split, then keep only the ones
# that occur in both.
ids_in_train = set(train['id'].unique())
ids_in_test = set(test['id'].unique())
common_ids = ids_in_train.intersection(ids_in_test)
count_common_ids = len(common_ids)

print(f"Number of unique id values that appear in both train and test: {count_common_ids}")

## Ratings

In [None]:
# Grouped bar chart comparing how often each rating (1-10) occurs in the
# train and test sets.
plt.figure(figsize=(10, 6))

ratings_range = range(1, 11)
x = np.array(ratings_range)
width = 0.35  # width of one bar; the two bars of a pair sit side by side

# Tally each split once up front rather than re-running value_counts() for
# every rating inside the comprehensions below.
train_rating_counts = train['rating'].value_counts()
test_rating_counts = test['rating'].value_counts()

# .get(i, 0) yields 0 for ratings that do not occur in a split.
train_counts = [train_rating_counts.get(i, 0) for i in ratings_range]
test_counts = [test_rating_counts.get(i, 0) for i in ratings_range]

# Shift each series by half a bar width so the pair is centred on its tick.
plt.bar(x - width/2, train_counts, width, color='cornflowerblue', alpha=0.7, label='Train')
plt.bar(x + width/2, test_counts, width, color='goldenrod', alpha=0.7, label='Test')

plt.xlabel('Ratings')
plt.ylabel('Counts')
plt.title('Distribution of Ratings in Train and Test Sets')
plt.xticks(ratings_range)
plt.legend()

plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## Length

In [None]:
def get_text_stats(df: pd.DataFrame) -> dict:
    """Compute character- and word-length statistics for the ``text`` column.

    Two columns are added to ``df`` in place: ``char_length`` (number of
    characters) and ``word_count`` (number of whitespace-separated tokens).
    Each is then summarised by its minimum, maximum and mean.

    Missing texts (``NaN``/``None``) are treated as empty strings, so they
    count as length 0 instead of raising ``TypeError``; any other non-string
    value is measured on its ``str()`` representation.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        Dict with the keys ``char_min``, ``char_max``, ``char_avg``,
        ``word_min``, ``word_max`` and ``word_avg``. The two averages are
        rounded to zero decimal places.
    """
    # Clean once so both measurements are taken on the same strings.
    texts = df['text'].fillna('').astype(str)

    # Character length
    df['char_length'] = texts.map(len)
    char_length = df['char_length']

    # Word count: str.split() with no argument splits on runs of whitespace.
    df['word_count'] = texts.map(lambda text: len(text.split()))
    word_count = df['word_count']

    return {
        'char_min': char_length.min(),
        'char_max': char_length.max(),
        'char_avg': round(char_length.mean(), 0),
        'word_min': word_count.min(),
        'word_max': word_count.max(),
        'word_avg': round(word_count.mean(), 0),
    }

# Compute the length statistics for both splits. get_text_stats() also adds
# the char_length / word_count columns to each DataFrame.
train_stats = get_text_stats(train)
test_stats = get_text_stats(test)

# Statistic keys in display order, paired one-to-one with the row labels.
stat_keys = ['char_min', 'char_max', 'char_avg', 'word_min', 'word_max', 'word_avg']
row_labels = [
    'Min Characters',
    'Max Characters',
    'Avg Characters',
    'Min Words',
    'Max Words',
    'Avg Words',
]

# Side-by-side comparison: one column per split, one row per statistic.
comparison = pd.DataFrame(
    {
        'Train': [train_stats[key] for key in stat_keys],
        'Test': [test_stats[key] for key in stat_keys],
    },
    index=row_labels,
)

# Render every value without decimals under a caption.
comparison.style.set_caption('Text Length Statistics').format("{:.0f}")

## Correlation

In [None]:
import scipy.stats as stats

# Pearson correlation between the rating and the review's word count on the
# train split (word_count is the column added by get_text_stats).
correlation, p_value = stats.pearsonr(train['rating'], train['word_count'])

print(f"Pearson correlation coefficient: {correlation:.4f}")
# Use significant digits for the p-value: fixed-point ".4f" would print any
# value below 0.00005 as "0.0000".
print(f"P-value: {p_value:.4g}")

In [None]:
# Mean, median and number of reviews' word counts for each rating value.
word_counts_per_rating = train.groupby('rating')['word_count']
word_count_stats = word_counts_per_rating.agg(['mean', 'median', 'count'])

# Heading (preceded by a blank line) followed by the table.
print("\nWord count statistics by rating:", word_count_stats, sep="\n")

In [None]:
# Box plot of the word-count distribution, one box per rating value.
plt.figure(figsize=(12, 6))

# NOTE(review): this covers ratings 0-10, whereas the ratings bar chart uses
# 1-10; a rating with no reviews yields an empty group here. Confirm which
# range is intended.
rating_values = range(0, 11)
word_counts_by_rating = [
    train.loc[train['rating'] == rating, 'word_count']
    for rating in rating_values
]

plt.boxplot(word_counts_by_rating, tick_labels=rating_values)
plt.xlabel('Rating')
plt.ylabel('Word Count')
plt.title('Word Count Distribution by Rating')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.cm import ScalarMappable
from matplotlib.colors import ListedColormap, Normalize

# Scatter plot of word count against rating. Within each rating the points
# are coloured by the word-count quintile they fall into, and a linear trend
# line is drawn over the whole train split.

# Make sure the word_count column exists (it is normally added earlier by
# get_text_stats); str() guards against non-string text values.
if 'word_count' not in train.columns:
    train['word_count'] = train['text'].apply(lambda text: len(str(text).split()))

# Create a figure
plt.figure(figsize=(12, 8))

# Define a colormap: one discrete colour per quintile.
colors = ['#2c7bb6', '#abd9e9', '#ffffbf', '#fdae61', '#d7191c']  # Blue to Red
cmap = ListedColormap(colors)

# For each rating value
for rating in sorted(train['rating'].unique()):
    # Get the subset of data for this rating
    subset = train[train['rating'] == rating]

    if len(subset) > 0:
        # Quintile index (0-4) of every word count within this rating.
        # duplicates='drop' merges bins with identical edges, so fewer than
        # five distinct indices may come back.
        quintiles = pd.qcut(subset['word_count'], q=5, labels=False, duplicates='drop')

        # If there are enough distinct values to create more than one bin
        if len(np.unique(quintiles)) > 1:
            # Pin the colour scale to 0-4 so that a given quintile index maps
            # to the same colour as on the colorbar even when qcut returned
            # fewer than five bins (scatter would otherwise rescale to the
            # min/max of `c` for each call).
            plt.scatter(
                [rating] * len(subset),   # x-values (all the same rating)
                subset['word_count'],     # y-values (word counts)
                c=quintiles,              # color based on quintile
                cmap=cmap,                # color map
                vmin=0,                   # fixed colour scale: lowest quintile
                vmax=len(colors) - 1,     # fixed colour scale: highest quintile
                alpha=0.7,                # transparency
                s=30                      # size of points
            )
        else:
            # Only one bin could be formed, use a single color
            plt.scatter(
                [rating] * len(subset),
                subset['word_count'],
                color=colors[2],  # Middle color
                alpha=0.7,
                s=30
            )

# Add a trend line: degree-1 least-squares fit of word count on rating.
x = train['rating']
y = train['word_count']
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(np.unique(x), p(np.unique(x)), "k--", lw=2)

# Calculate correlation coefficient
correlation = np.corrcoef(train['rating'], train['word_count'])[0, 1]

# Add labels and title
plt.xlabel('Rating', fontsize=12)
plt.ylabel('Word Count', fontsize=12)
plt.title(f'Correlation between Rating and Word Count (r={correlation:.4f})', fontsize=14)
plt.xticks(np.arange(0, 11, 1))  # Set x-axis ticks to integers 0-10
plt.grid(True, alpha=0.3)

# Add a color bar legend. The ScalarMappable is a stand-alone object that is
# not attached to any Axes, so the Axes to take space from must be passed
# explicitly; newer Matplotlib releases refuse to guess and raise otherwise.
sm = ScalarMappable(cmap=cmap, norm=Normalize(vmin=0, vmax=4))
sm.set_array([])
cbar = plt.colorbar(sm, ax=plt.gca())
cbar.set_label('Quintile within Rating Group', fontsize=10)
# Five colour bands share the 0-4 scale (0.8 units each); put each tick at
# the centre of its band.
cbar.set_ticks([0.4, 1.2, 2, 2.8, 3.6])
cbar.set_ticklabels(['1st', '2nd', '3rd', '4th', '5th'])

plt.tight_layout()
plt.show()