### Load and transform textual script data

In [23]:
import pandas as pd

# Get scripts data. The release year is not used as a feature, so drop it.
script_csv_path = "./processed_data/combined_script_data.csv"
script_data = pd.read_csv(script_csv_path).drop(columns=["release_year"])

# Get Oscar labels. The "category" column is repurposed as a constant
# "nominated" flag (presumably every row of this file is a nomination --
# TODO confirm) and "winner" becomes the "won" flag.
oscar_labels_path = "./processed_data/oscar_data.csv"
oscar_label = (
    pd.read_csv(oscar_labels_path)
    .rename(columns={"entity": "title", "category": "nominated", "winner": "won"})
    .drop(columns=["year"])
    .assign(nominated=True)
)

# Merge script and Oscar data.
# NOTE(review): both year columns are dropped before the join, so different
# films that share a title are matched to each other's Oscar rows -- confirm
# that joining on title alone is acceptable.
script_oscar_df = (
    pd.merge(script_data, oscar_label, how="outer", on="title")
    # Oscar rows without a matching script cannot be used by a text model.
    .dropna(subset=["script"])
    # Films with no Oscar row have NaN labels after the merge: treat them as
    # "not nominated / not won". Only the two label columns are filled, and
    # the boolean cast is actually kept (the original discarded its result).
    .fillna({"nominated": False, "won": False})
    .astype({"nominated": bool, "won": bool})
)

# Keep rows whose two flags agree: Oscar winners (True/True) and films that
# were never nominated (False/False). Nominations that did not win are
# excluded from the data set.
flags_agree = script_oscar_df["nominated"] == script_oscar_df["won"]
script_oscar_df = (
    script_oscar_df[flags_agree]
    .drop_duplicates()
    .drop(columns=["nominated"])
    .astype({"won": int})  # 1 = won an Oscar, 0 = never nominated
)
display(script_oscar_df)

# reset_index() returns a new frame, so the result must be assigned;
# drop=True discards the old row labels instead of adding an "index" column.
script_oscar_df = script_oscar_df.reset_index(drop=True)
print('Done')

Unnamed: 0,title,script,won
1,When Worlds Collide,"['needl', 'heaven', 'haystack', 'star', 'heave...",1
5,Reds,"['rememb', 'im', 'begin', 'forget', 'peopl', '...",1
12,Undefeated,"['let', 'start', 'right', 'guard', 'shot', 'lo...",1
15,Seven Brides for Seven Brothers,"['deliv', 'perfectionand', 'dont', 'brag', 'd'...",1
25,Henry V,"['oh', 'muse', 'ascend', 'brightest', 'heaven'...",1
...,...,...,...
7441,The Roommate,"['hi', 'hi', 'sara', 'matthew', 'sara', 'matth...",0
7442,Night of the Comet,"['record', 'time', 'swung', 'univers', 'ellipt...",0
7443,Paddington,"['darkest', 'peru', 'vast', 'unexplor', 'wilde...",0
7444,Jumanji: Welcome to the Jungle,"['ocean', 'wave', 'crash', 'seagul', 'squawk',...",0


Done


In [24]:
# Model inputs: the script text column is the feature, the 0/1 "won" column is the target.
features = script_oscar_df['script']
labels = script_oscar_df['won']
print('done')

done


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def build_tfidf(xtrain, max_features=10000, min_df=0.05, max_df=0.5):
    """Fit a TF-IDF vectorizer on the training documents and return it.

    Parameters
    ----------
    xtrain : iterable of str
        Documents used to learn the vocabulary and IDF weights.
    max_features : int, default 10000
        Upper bound on the vocabulary size.
    min_df : float or int, default 0.05
        Minimum document frequency a term needs to be kept
        (a float is a proportion of documents, an int an absolute count).
    max_df : float or int, default 0.5
        Maximum document frequency a term may have to be kept
        (same float/int convention as ``min_df``).

    Returns
    -------
    TfidfVectorizer
        The vectorizer after ``fit(xtrain)``; call ``transform`` on it to
        vectorize any split with the same vocabulary.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
    )
    # Fit only; the caller decides which splits to transform.
    vectorizer.fit(xtrain)
    return vectorizer

# Stratified 80/20 split with a fixed seed so the class ratio is the same
# in both parts and the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=0,
)

print('Vectorizing data with tfidf...')
# The vectorizer is fitted on the training split only, then applied to both splits.
vectorizer = build_tfidf(xtrain)
xtrain, xtest = (vectorizer.transform(split) for split in (xtrain, xtest))
print('done')

Vectorizing data with tfidf...
done


### Exploratory Data Analysis

In [None]:
# TODO: explore feature importance

# TODO: explore the confusion matrix

### Visualize data

### ML prediction

In [26]:
from sklearn.linear_model import LogisticRegression

print('Predicting...')
# The saga solver shuffles the training data, so without a seed the fitted
# model (and the score printed below) can change from run to run. Pin it to
# the same seed value used for the train/test split.
lr_clf = LogisticRegression(solver='saga', random_state=0)
lr_clf.fit(xtrain, ytrain)
# score() reports mean accuracy on the held-out split.
# NOTE(review): if "won" is heavily imbalanced, accuracy alone can look good
# for a model that mostly predicts the majority class -- compare against a
# majority-class baseline before drawing conclusions.
print(lr_clf.score(xtest, ytest))
print('done')

Predicting...
0.8390625
done


