In [22]:
# import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [23]:
# load data

movies = pd.read_csv('IMDB Horror movies.csv')
pd.options.display.max_columns = None
movies.head(1)

Unnamed: 0,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Plot,Cast,Language,Filming Locations,Budget
0,Gut,2012.0,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,26-Oct-12,USA,,3.9,91 min,"Directed by Elias. With Jason Vail, Nicholas W...",Jason Vail|Nicholas Wilder|Sarah Schoofs|Kirst...,English,"New York, USA",


## Data Wrangling

In [24]:
# Dropping columns: Filming Locations and Budget

movies= movies.drop(['Filming Locations', 'Budget'], axis = 1)
movies.dropna(inplace=True)
movies.head(1)

Unnamed: 0,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Plot,Cast,Language
3,Treasure Chest of Horrors II,2013.0,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,"Directed by M. Kelley, Shawn C. Phillips, Alex...",Veronica Ricci|Nicholas Adam Clark|James Culle...,English


In [25]:
# Split column: Plot

movies1 = movies['Plot'].str.split('.', expand = True).rename(columns = lambda x: 'Directors' + str(x + 1))
movies1.rename(columns = {'Directors1' : 'Directors1', 'Directors2' : 'Cast1', 'Directors3' : 'Plot1'}, inplace = True)
movies1 = movies1[['Directors1', 'Cast1', 'Plot1']]
movies1.head(1)

Unnamed: 0,Directors1,Cast1,Plot1
3,Directed by M,"Kelley, Shawn C","Phillips, Alex Powers"


In [26]:
# Concat datasets: movies and movies1

movies2 = pd.concat([movies, movies1], axis = 1, keys = ['Title', 'Directors'])
movies2.head(1)

Unnamed: 0_level_0,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Directors,Directors,Directors
Unnamed: 0_level_1,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Plot,Cast,Language,Directors1,Cast1,Plot1
3,Treasure Chest of Horrors II,2013.0,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,"Directed by M. Kelley, Shawn C. Phillips, Alex...",Veronica Ricci|Nicholas Adam Clark|James Culle...,English,Directed by M,"Kelley, Shawn C","Phillips, Alex Powers"


## Exporting dataframe: newMovies to an csv file.

In [None]:
movies2.to_csv("newMovies.csv", index = False) #Python

# loading Data frime: newMovies

In [27]:
newMovies = pd.read_csv('newMovies.csv')
pd.options.display.max_columns = None
newMovies.head(1)

Unnamed: 0,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Plot,Cast,Language,Directors1,Cast1,Plot1
0,Treasure Chest of Horrors II,2013,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,"Directed by M. Kelley, Shawn C. Phillips, Alex...",Veronica Ricci|Nicholas Adam Clark|James Culle...,English,Directed by M,"Kelley, Shawn C","Phillips, Alex Powers"


## Data Wrangling newMovies.csv

In [28]:
newMovies= newMovies.drop(['Plot', 'Cast'], axis = 1)
newMovies.head(1)

Unnamed: 0,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Language,Directors1,Cast1,Plot1
0,Treasure Chest of Horrors II,2013,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,English,Directed by M,"Kelley, Shawn C","Phillips, Alex Powers"


In [29]:
newMovies= newMovies.drop(['Title','Genre-Horror', 'Genre-Thriller','Genre-Sci-Fi', 'Genre-Mystery', 'Genre-Romance', 'Genre-Fantasy',
              'Genre-Comdey', 'Genre-Drama', 'Genre-Crime', 'Genre-War', 'Genre-Western', 'Genre-Musical', 'Genre-Family',
              'Genre-Adventure', 'Genre-Sport', 'Genre-History', 'Release Date', 'Movie Rating', 'Movie Run Time',
              'Language', 'Directors1', 'Cast1', 'Plot1'], axis = 1)
newMovies.head(1)

Unnamed: 0,Year,Release Country,Review Rating
0,2013,USA,3.7


## Machine Learning 

In [30]:
# Subsetting: Release Country 
x = newMovies.drop('Release Country', axis=1)
y = newMovies['Release Country']

In [31]:
# Spliting the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

In [32]:
# Random Tree
nmovie = RandomForestClassifier(n_estimators=500)
nmovie.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500)

In [33]:
moviePredictions = nmovie.predict(x_test)
print(confusion_matrix(y_test, moviePredictions))

[[  0   0   0 ...   2   0   0]
 [  0   0   0 ...   1   0   0]
 [  0   0   0 ...   1   0   0]
 ...
 [  0   0   0 ... 280   0   0]
 [  0   0   0 ...   1   0   0]
 [  0   0   0 ...   1   0   0]]


In [99]:
print(classification_report(y_test, moviePredictions))

                      precision    recall  f1-score   support

           Argentina       0.00      0.00      0.00         1
           Australia       0.00      0.00      0.00         2
             Bahrain       0.00      0.00      0.00         1
             Belgium       0.00      0.00      0.00         2
              Brazil       0.00      0.00      0.00         1
              Canada       0.00      0.00      0.00         4
               Chile       0.00      0.00      0.00         1
               China       0.00      0.00      0.00         1
               Egypt       0.00      0.00      0.00         1
              France       0.00      0.00      0.00         5
             Germany       0.25      0.12      0.17         8
               India       0.00      0.00      0.00         3
             Ireland       0.00      0.00      0.00         1
               Japan       0.00      0.00      0.00        10
              Kuwait       0.00      0.00      0.00         5
       

  _warn_prf(average, modifier, msg_start, len(result))


## Working with ANOVAs

In [34]:
# Loading data frame
anovaMovies = pd.read_csv('newMovies.csv')
pd.options.display.max_columns = None
anovaMovies.head(1)

Unnamed: 0,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Plot,Cast,Language,Directors1,Cast1,Plot1
0,Treasure Chest of Horrors II,2013,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,"Directed by M. Kelley, Shawn C. Phillips, Alex...",Veronica Ricci|Nicholas Adam Clark|James Culle...,English,Directed by M,"Kelley, Shawn C","Phillips, Alex Powers"


## Data Wrangling: anovaMovies 

In [35]:
anovaMovies = anovaMovies.drop(['Plot', 'Cast', 'Directors1', 'Cast1', 'Plot1'], axis = 1)
anovaMovies.head(1)

Unnamed: 0,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Language
0,Treasure Chest of Horrors II,2013,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,English


In [36]:
# sum the column in anovaMovies

for column in anovaMovies.columns:
    print("\n" + column)
    print(anovaMovies[column].value_counts())


Title
Amnesiac                          2
Silent But Deadly                 2
Shame the Devil                   2
Toxin                             2
Rape Zombie: Lust of the Dead     2
                                 ..
The Demon's Rook                  1
Drink Me                          1
Some Guy Who Kills People         1
I Spit on Your Grave 2            1
Queen of Blood                    1
Name: Title, Length: 1322, dtype: int64

Year
2014    272
2013    258
2015    231
2012    224
2016    179
2017     75
2011     59
2010     16
2009      8
2006      3
2008      3
2007      1
1999      1
1998      1
1991      1
1979      1
1973      1
1923      1
Name: Year, dtype: int64

Genre-Horror
 Horror    1335
Name: Genre-Horror, dtype: int64

Genre-Thriller
NO GENRE     921
 Thriller    404
Thriller      10
Name: Genre-Thriller, dtype: int64

Genre-Sci-Fi
NO GENRE    1235
 Sci-Fi       99
Sci-Fi         1
Name: Genre-Sci-Fi, dtype: int64

Genre-Mystery
NO GENRE    1248
 Mystery      8

In [37]:
anovaMovies1 = anovaMovies['Release Date'].str.split('-', expand = True).rename(columns = lambda x: 'Release Date' + str(x + 1))
anovaMovies1.rename(columns = {'Release Date1' : 'Day', 'Release Date2' : 'Month', 'Release Date3' : 'Year'}, inplace = True)
anovaMovies1 = anovaMovies1[['Day', 'Month']]
anovaMovies1.head(1)

Unnamed: 0,Day,Month
0,23,Apr


In [38]:
anovaMovies2 = anovaMovies['Movie Run Time'].str.split(' ', expand = True).rename(columns = lambda x: 'Movie Run Time' + str(x + 1))
anovaMovies2.rename(columns = {'Movie Run Time1' : 'Movie Length', 'Movie Run Time2' : 'min'}, inplace = True)
anovaMovies2 = anovaMovies2[['Movie Length', 'min']]
anovaMovies2.head(1)

Unnamed: 0,Movie Length,min
0,82,min


In [39]:
# Concat datasets: movies and movies1

anovaMovies3a = pd.concat([anovaMovies, anovaMovies1, anovaMovies2], axis = 1, keys = ['Title', 'Day', 'Movie Length'])
anovaMovies3a.head(1)

Unnamed: 0_level_0,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Day,Day,Movie Length,Movie Length
Unnamed: 0_level_1,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Language,Day,Month,Movie Length,min
0,Treasure Chest of Horrors II,2013,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,English,23,Apr,82,min


In [40]:
# Dropping rows without values

anovaMovies3a.dropna(inplace=True)
anovaMovies3a.head(1)

Unnamed: 0_level_0,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Title,Day,Day,Movie Length,Movie Length
Unnamed: 0_level_1,Title,Year,Genre-Horror,Genre-Thriller,Genre-Sci-Fi,Genre-Mystery,Genre-Romance,Genre-Fantasy,Genre-Comdey,Genre-Drama,Genre-Crime,Genre-War,Genre-Western,Genre-Musical,Genre-Family,Genre-Adventure,Genre-Sport,Genre-History,Release Date,Release Country,Movie Rating,Review Rating,Movie Run Time,Language,Day,Month,Movie Length,min
0,Treasure Chest of Horrors II,2013,Horror,Thriller,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,NO GENRE,23-Apr-13,USA,NOT RATED,3.7,82 min,English,23,Apr,82,min


In [147]:
# Exporting dataframe: anovaMovies3 to an csv file.
anovaMovies3a.to_csv("anovaMovies3a.csv", index = False) #Python