In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../Resources/cleaned_data.csv")
df.head()

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Directors,Genres,Country,Runtime,production_company,Top_Genres,Top_Director
0,Inception,2010,13+,8.8,8.7,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller",United States,148.0,Warner Bros.,Action,Christopher Nolan
1,The Matrix,1999,18+,8.7,8.7,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,136.0,,Action,Lana Wachowski
2,Avengers: Infinity War,2018,13+,8.5,8.4,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,149.0,,Action,Anthony Russo
3,Back to the Future,1985,7+,8.5,9.6,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,116.0,,Adventure,Robert Zemeckis
4,"The Good, the Bad and the Ugly",1966,18+,8.8,9.7,1,0,1,0,Sergio Leone,Western,Italy,161.0,,Western,Sergio Leone


In [3]:
df = df[["Year", "Country", "Runtime", "Top_Genres", "Rotten Tomatoes"]]
df = df.rename(columns={"Top_Genres":"Genre"})
df = df.dropna()
df["Runtime"] = df[["Runtime"]].round(0).astype(int)
df["Rotten Tomatoes"] = (df["Rotten Tomatoes"]*10).round(0).astype(int)
df.head()

Unnamed: 0,Year,Country,Runtime,Genre,Rotten Tomatoes
0,2010,United States,148,Action,87
1,1999,United States,136,Action,87
2,2018,United States,149,Action,84
3,1985,United States,116,Adventure,96
4,1966,Italy,161,Western,97


In [4]:
df["Genre"].unique()

array(['Action', 'Adventure', 'Western', 'Animation', 'Biography',
       'Drama', 'Crime', 'Comedy', 'Documentary', 'Family', 'Horror',
       'Thriller', 'Mystery', 'Fantasy', 'Romance', 'Short', 'Sci-Fi',
       'Sport', 'War', 'History', 'Musical'], dtype=object)

In [5]:
cleanup_genre = {"Genre": {'Action': 5, 'Adventure': 5, 'Western': 3, 'Animation': 7, 'Biography': 1,
       'Drama': 8, 'Crime': 5, 'Comedy': 4, 'Documentary': 1, 'Family': 8, 'Horror': 6,
       'Thriller': 6, 'Mystery': 5, 'Fantasy': 7, 'Romance': 8, 'Short': 7, 'Sci-Fi': 6,
       'Sport': 2, 'Reality-TV': 4, 'Musical': 7, 'Music': 7, 'War': 3, 'History': 1,
       'Film-Noir': 6, 'Talk-Show': 4, 'Game-Show': 4}}

In [6]:
df = df.replace(cleanup_genre)
df.head()

Unnamed: 0,Year,Country,Runtime,Genre,Rotten Tomatoes
0,2010,United States,148,5,87
1,1999,United States,136,5,87
2,2018,United States,149,5,84
3,1985,United States,116,5,96
4,1966,Italy,161,3,97


In [7]:
df.loc[df["Country"] == "United States", "Country"] = 1
df.head()

Unnamed: 0,Year,Country,Runtime,Genre,Rotten Tomatoes
0,2010,1,148,5,87
1,1999,1,136,5,87
2,2018,1,149,5,84
3,1985,1,116,5,96
4,1966,Italy,161,3,97


In [8]:
df.loc[df["Country"] != 1, "Country"] = 0
df["Country"] = df["Country"].astype(int)
df.head()

Unnamed: 0,Year,Country,Runtime,Genre,Rotten Tomatoes
0,2010,1,148,5,87
1,1999,1,136,5,87
2,2018,1,149,5,84
3,1985,1,116,5,96
4,1966,0,161,3,97


In [9]:
df.loc[df["Rotten Tomatoes"] < 60, "Rotten Tomatoes"] = 0
df.loc[df["Rotten Tomatoes"] >= 60, "Rotten Tomatoes"] = 1

In [10]:
df["Rotten Tomatoes"].value_counts()

1    3219
0    1894
Name: Rotten Tomatoes, dtype: int64

In [11]:
df.head()

Unnamed: 0,Year,Country,Runtime,Genre,Rotten Tomatoes
0,2010,1,148,5,1
1,1999,1,136,5,1
2,2018,1,149,5,1
3,1985,1,116,5,1
4,1966,0,161,3,1


In [12]:
target = df["Rotten Tomatoes"].values.reshape(-1, 1)
target_names = ["Rotten", "Fresh"]
data = df.drop("Rotten Tomatoes", axis=1)
feature_names = data.columns

data.head()

Unnamed: 0,Year,Country,Runtime,Genre
0,2010,1,148,5
1,1999,1,136,5
2,2018,1,149,5
3,1985,1,116,5
4,1966,0,161,3


In [13]:
df.dtypes

Year               int64
Country            int64
Runtime            int64
Genre              int64
Rotten Tomatoes    int64
dtype: object

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

  rf = rf.fit(X_train, y_train)


0.6140808344198174

In [16]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.5212594814555525, 'Runtime'),
 (0.32370160928260816, 'Year'),
 (0.12673462688734882, 'Genre'),
 (0.028304282374490602, 'Country')]

In [17]:
y_test[0:10]

array([[1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1]])

In [18]:
rf.predict(X_test[0:10])

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 1])