# Movie Script Dataset

### Load and transform textual script data

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

def load_data(filename1, filename2):
    """Read two script CSVs and return their stacked text and label columns.

    Args:
        filename1: Path or file-like object of the first CSV, handed to
            ``pd.read_csv``.
        filename2: Path or file-like object of the second CSV, handed to
            ``pd.read_csv``.

    Returns:
        A ``(features, labels)`` tuple of pandas Series: the ``'script'``
        column and the ``'oscar_nominee'`` column of the row-wise
        concatenation of both files. Both Series share one fresh
        ``0..n-1`` index.

    Raises:
        KeyError: If the concatenated frame has no ``'script'`` or
            ``'oscar_nominee'`` column.
    """
    frames = [pd.read_csv(source) for source in (filename1, filename2)]
    # Each file is read with its own 0-based row index; without
    # ignore_index=True the stacked frame would repeat index labels, making
    # any later label-based lookup or alignment ambiguous.
    combined_df = pd.concat(frames, axis=0, ignore_index=True)
    features = combined_df['script']
    labels = combined_df['oscar_nominee']
    return features, labels

def build_tfidf(xtrain):
    """Create a TF-IDF vectorizer and fit it on the training documents.

    The vectorizer is configured with ``max_features=10000``,
    ``min_df=0.05`` and ``max_df=0.5``.

    Args:
        xtrain: Iterable of raw text documents used to fit the vectorizer.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    # fit() hands back the vectorizer itself, so build and fit in one step.
    return TfidfVectorizer(
        max_features=10000,
        min_df=0.05,
        max_df=0.5,
    ).fit(xtrain)

# Read both script CSVs and carve out a stratified 80/20 train/test split.
print('Loading data...')
features, labels = load_data(
    'cleaned_oscar_movie_scripts.csv',
    'cleaned_non_oscar_movie_scripts.csv',
)
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=0,
)

# Fit TF-IDF on the training scripts only, then apply the same fitted
# vectorizer to both splits. Note that xtrain/xtest are rebound from raw
# text to the transformed matrices here.
print('Vectorizing data with tfidf...')
vectorizer = build_tfidf(xtrain)
xtrain, xtest = vectorizer.transform(xtrain), vectorizer.transform(xtest)

Loading data...
Vectorizing data with tfidf...
Predicting...
0.7052730696798494


### Topic clustering for Oscar-winning movies

### Sample prediction

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

print('Predicting...')
# The saga solver shuffles the training data, so pin random_state (0, the
# same seed used for the train/test split) to make the fitted model and the
# reported score repeatable across runs.
lr_clf = LogisticRegression(solver='saga', random_state=0)
lr_clf.fit(xtrain, ytrain)
# score() reports mean accuracy on the test split.
print(lr_clf.score(xtest, ytest))

Predicting...
0.7052730696798494
