# Movie Script Dataset

### Load and transform textual script data

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def load_data(filename1, filename2):
    """Load two script CSV files and return the script texts with their labels.

    Both files are read with ``pd.read_csv`` and stacked row-wise into a
    single frame, from which the ``script`` and ``oscar_nominee`` columns are
    extracted.

    Args:
        filename1: Path to the first CSV file.
        filename2: Path to the second CSV file.

    Returns:
        A ``(features, labels)`` tuple of pandas Series: the ``script`` column
        and the ``oscar_nominee`` column of the stacked frame. Both share a
        fresh ``0..n-1`` index.

    Raises:
        KeyError: If the stacked frame has no ``script`` or ``oscar_nominee``
            column.
    """
    frames = [pd.read_csv(filename1), pd.read_csv(filename2)]
    # sort=False makes the column-ordering behaviour explicit (older pandas
    # emits a FutureWarning when it is omitted and the columns differ).
    # ignore_index=True avoids duplicate row labels: without it the rows of
    # the second file reuse labels 0..k-1 already taken by the first file.
    combined_df = pd.concat(frames, axis=0, ignore_index=True, sort=False)
    features = combined_df['script']
    labels = combined_df['oscar_nominee']
    return features, labels

def build_tfidf(xtrain, max_features=10000, min_df=0.05, max_df=0.5):
    """Fit a TF-IDF vectorizer on the training documents.

    Args:
        xtrain: Iterable of raw text documents used to learn the vocabulary
            and IDF weights.
        max_features: Upper bound on the vocabulary size, passed to
            ``TfidfVectorizer``.
        min_df: Minimum document frequency for a term to be kept, passed to
            ``TfidfVectorizer``.
        max_df: Maximum document frequency for a term to be kept, passed to
            ``TfidfVectorizer``.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df, max_df=max_df)
    # Fit on the documents passed in only; callers transform other splits
    # with the returned vectorizer.
    vectorizer.fit(xtrain)
    return vectorizer

print('Loading data...')
# Forward slashes resolve on Windows as well as POSIX systems, whereas a
# backslash is only a path separator on Windows.
features,labels = load_data('processed_data/cleaned_oscar_movie_scripts.csv', 'processed_data/cleaned_non_oscar_movie_scripts.csv')
# Hold out 20% of the scripts for testing; stratify so both splits keep the
# same label proportions, and fix the seed so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(features, labels, random_state=0, test_size=0.2, stratify=labels)

print('Vectorizing data with tfidf...')
# The vectorizer is fitted on the training scripts only and then applied to
# both splits, so no test-set text influences the learned vocabulary.
vectorizer = build_tfidf(xtrain)
xtrain = vectorizer.transform(xtrain)
xtest  = vectorizer.transform(xtest)

Loading data...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Vectorizing data with tfidf...


### Only run this cell to remove Oscar nominees from the non-Oscar dataset

In [24]:
def clean_non_oscar_data(oscar_filename, non_oscar_filename,
                         output_filename='processed_data/fixed_cleaned_non_oscar_data.csv'):
    """Write the non-Oscar CSV with every title that is also in the Oscar CSV removed.

    Args:
        oscar_filename: Path to the Oscar CSV; must contain a ``title`` column.
        non_oscar_filename: Path to the non-Oscar CSV; must contain a
            ``title`` column.
        output_filename: Destination CSV path. Defaults to the location this
            notebook has always written to.

    Returns:
        None. The filtered frame is written to ``output_filename`` without
        the row index.

    Raises:
        KeyError: If either input file has no ``title`` column.
    """
    oscar_df = pd.read_csv(oscar_filename)
    non_oscar_df = pd.read_csv(non_oscar_filename)

    # Missing titles are dropped from the lookup set so that a NaN title in
    # the non-Oscar data never counts as a match; such rows are always kept.
    oscar_titles = set(oscar_df['title'].dropna())
    is_oscar_title = non_oscar_df['title'].isin(oscar_titles)

    # 'Unnamed: 0' is the stray index column left by an earlier to_csv call;
    # errors='ignore' keeps this working when the input no longer has it.
    cleaned = non_oscar_df.loc[~is_oscar_title].drop(columns=['Unnamed: 0'], errors='ignore')
    cleaned.to_csv(output_filename, index=False)

# The call is left commented out so that running this cell does not rewrite
# the cleaned non-Oscar CSV; uncomment it to regenerate that file.
# clean_non_oscar_data('processed_data\\cleaned_oscar_movie_scripts.csv', 'processed_data\\cleaned_non_oscar_movie_scripts.csv')
print('done')

done


### Only run this cell to combine script CSVs

In [None]:
def combine_oscar_csvs(oscar_filename, non_oscar_filename,
                       output_filename='processed_data/combined_script_data.csv'):
    """Stack the two script CSVs and write only title, release year and script.

    Args:
        oscar_filename: Path to the Oscar CSV.
        non_oscar_filename: Path to the non-Oscar CSV.
        output_filename: Destination CSV path. Defaults to the location this
            notebook has always written to.

    Returns:
        None. The combined frame is written to ``output_filename`` without
        the row index, with columns ``title``, ``release_year``, ``script``
        in that order.

    Raises:
        KeyError: If the stacked frame lacks any of the three output columns.
    """
    frames = [pd.read_csv(oscar_filename), pd.read_csv(non_oscar_filename)]
    combined_df = pd.concat(frames, axis=0, ignore_index=True, sort=False)
    # Selecting the wanted columns already discards 'Unnamed: 0' and
    # 'oscar_nominee', so no explicit deletes are needed (and a file that
    # lacks either of those columns no longer raises KeyError).
    combined_df = combined_df[['title', 'release_year', 'script']]
    combined_df.to_csv(output_filename, index=False)

# Running this cell rewrites the combined script CSV. Forward slashes are
# used so the relative paths resolve on Windows and POSIX systems alike.
combine_oscar_csvs('processed_data/cleaned_oscar_movie_scripts.csv', 'processed_data/cleaned_non_oscar_movie_scripts.csv')
print('done')

### Topic clustering for Oscar-winning movies

### Sample prediction

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

print('Predicting...')
# 'saga' shuffles the data while optimising, so a fixed random_state is
# needed for the reported score to be reproducible across runs; 0 matches
# the seed used for the train/test split.
lr_clf = LogisticRegression(solver='saga', random_state=0).fit(xtrain, ytrain)
# score() reports mean accuracy on the held-out test split.
print(lr_clf.score(xtest, ytest))

Predicting...
0.7439024390243902
