In [1136]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
import warnings

In [1137]:
warnings.filterwarnings('ignore')

In [1138]:
# Read the movies and credits CSV files from the working directory,
# then preview the first movies row.
data, data2 = (
    pd.read_csv(csv_name)
    for csv_name in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)
data.head(n=1)

In [None]:
# Derive profit as revenue minus budget; on first run the new column lands last.
data['profit'] = data['revenue'].sub(data['budget'])
data.head(n=1)

In [None]:
# Preview the credits table.
# NOTE(review): data2 is not read by any later cell visible in this notebook.
data2.head(n=2)

In [None]:
# Keep only the six columns the rest of the notebook reads, selected by name.
# The previous positional slices (columns[8:9], columns[-4:-2], columns[-1:], ...)
# silently pick different columns if the CSV layout changes or if the cell that
# adds 'profit' has not been run first.
# NOTE(review): the order below is assumed to match what the old slices produced
# on the standard TMDB 5000 movies export — confirm.
columns = data.columns.to_list()
good_list = ['popularity', 'release_date', 'runtime', 'title', 'vote_average', 'profit']
# .copy() gives new_data its own storage, so the later cleaning steps never act
# on (or warn about) a slice of `data`.
new_data = data[good_list].copy()

In [None]:
# Preview the reduced frame before any cleaning.
new_data.head(n=3)

In [None]:
# Inspect column dtypes. release_date is still text here: a later cell splits it
# with .str and then casts the year to int64.
new_data.dtypes

In [None]:
# Use the movie title as the row index. set_index returns a new frame with the
# 'title' column moved into the index, so nothing is mutated in place and the
# step is a single expression instead of an assign followed by an inplace drop.
new_data = new_data.set_index('title')

In [None]:
# Preview again: rows are now labelled by title.
new_data.head(n=3)

In [None]:
# Reduce release_date to its first '-'-separated piece (the leading year field).
date_parts = new_data['release_date'].str.split('-')
new_data['release_date'] = date_parts.str[0]

In [None]:
# Number of rows per release year (values are still strings at this point).
new_data['release_date'].value_counts()

In [None]:
# Distinct release-year values, useful for spotting missing or malformed entries.
new_data['release_date'].unique()

In [None]:
# Missing values per column, before dropping incomplete rows.
new_data.isnull().sum()

In [None]:
# Drop rows lacking a release year or a runtime. Rebinding the result (rather than
# inplace=True) avoids mutating a frame that was derived from `data`.
new_data = new_data.dropna(subset=['release_date', 'runtime'])

In [None]:
# Re-check missing values per column after the dropna step.
new_data.isnull().sum()

In [None]:
# Cast the year strings to 64-bit integers so release_date can be used as a numeric feature.
new_data['release_date'] = new_data['release_date'].astype('int64')

In [None]:
# Summarise the cleaned frame (non-null counts and dtypes) after the dropna/astype steps.
new_data.info()

In [None]:
# Pairwise correlations between all columns of the cleaned frame.
fig, ax = plt.subplots()
sns.heatmap(new_data.corr(), annot=True, cmap='mako', linewidths=2, linecolor='white', ax=ax)
ax.set_title('Heatmap')
plt.show()
# Close the figure instead of calling plt.clf(): once show() has released the
# figure, clf() has nothing to clear and opens a new empty figure instead
# (rendered as a blank "0 Axes" output under the inline backend).
plt.close(fig)

In [None]:
# Disabled experiment: trim every column to its 2nd-98th percentile range.
# NOTE(review): if re-enabled as written, indexing with a boolean DataFrame masks
# the non-matching cells to NaN rather than dropping rows, so a follow-up dropna()
# (or a row-wise .all(axis=1) mask) would be needed before modelling — confirm intent.
#upper  = new_data.quantile(0.98)
#lower = new_data.quantile(0.02)
#new_data = new_data[(new_data < upper) & (new_data > lower)]

In [None]:
# Feature matrix (four numeric predictors) and regression target (vote_average).
feature_columns = ['popularity', 'release_date', 'runtime', 'profit']
target_column = 'vote_average'
X = new_data[feature_columns]
y = new_data[target_column]

In [None]:
# Last five feature rows.
X.tail(n=5)

In [None]:
# Last five target values, aligned with the feature rows shown by X.tail().
y.tail(n=5)

In [None]:
# Correlations among the predictors only (checks for collinear features).
fig, ax = plt.subplots()
sns.heatmap(X.corr(), annot=True, cmap='Greens', linewidths=2, linecolor='white', ax=ax)
ax.set_title('Heatmap')
plt.show()
# Close the figure instead of calling plt.clf(): once show() has released the
# figure, clf() has nothing to clear and opens a new empty figure instead.
plt.close(fig)

In [None]:
# Scatter-matrix of the raw (unscaled) predictors.
sns.pairplot(X)
plt.show()
# plt.close() is a no-op when no figure is open, whereas plt.clf() would create
# a new empty figure just to clear it.
plt.close()

In [None]:
# Per-column box plots of the raw predictors (wide-form input: one box per column).
fig, ax = plt.subplots()
# Pass the frame by keyword so the call does not depend on seaborn's positional
# argument order.
sns.boxplot(data=X, ax=ax)
plt.show()
# Close rather than clf(): clf() after show() would open a fresh empty figure.
plt.close(fig)

In [None]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
predictors = X

In [None]:
predictors, y

In [None]:
sns.pairplot(pd.DataFrame(predictors), diag_kind='kde')
plt.show()
plt.clf()

In [None]:
sns.boxplot(pd.DataFrame(predictors))
plt.show()
plt.clf()

In [None]:
y = np.array(y)
pred_rate = y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(predictors, pred_rate, test_size=0.3, random_state=50)

In [None]:
# Baseline model: unweighted average of the 40 nearest training neighbours.
# fit() returns the estimator itself, so `model` and `regressor` are the same object.
regressor = KNeighborsRegressor(weights='uniform', n_neighbors=40)
regressor.fit(X_train, y_train)
model = regressor

In [None]:
# Spot-check the fitted model on four hand-entered feature rows.
# NOTE(review): the values look like already-standardized
# (popularity, release_date, runtime, profit) rows — confirm where they were copied from.
sample_rows = [
    [ 4.05206402,  0.52658122,  2.44254103, 18.36714242],
    [-0.63047829,  0.84882375,  0.58077638, -0.39159292],
    [-0.64894119,  0.76826311, -0.39443368, -0.39159292],
    [-0.61522296,  0.20433869, -0.74905552, -0.39159292],
]
model.predict(sample_rows)

In [None]:
# Predictions for the held-out rows from the model bound to `model` above.
# NOTE(review): y_pred is not read by any later cell visible here.
y_pred = model.predict(X_test)

In [None]:
# Sweep k = 1..99 and record how well each KNN regressor fits the held-out rows.
# scores_and_k keeps [score, k] pairs; accuracies keeps the bare scores in k order
# (both names are read by the plotting cell that follows).
# NOTE(review): k is chosen on the same split that is used to report the final
# score, so the best value is an optimistic estimate — consider a validation
# split or cross-validation.
scores_and_k = []
accuracies = []
for k in range(1, 100):
    regressor = KNeighborsRegressor(n_neighbors=k)
    regressor.fit(X_train, y_train)
    # For regressors, score() is the coefficient of determination (R^2),
    # not a classification accuracy, so it is labelled as such.
    a = regressor.score(X_test, y_test)
    print(f'k={k}: R^2 = {a}')
    scores_and_k.append([a, k])
    accuracies.append(a)
# Pairs compare score first, then k, so ties resolve to the larger k.
best_score, best_k = max(scores_and_k)
print(f'Max R^2: {best_score}, number of neighbors: {best_k}')


In [None]:
# Plot the score recorded for every k and mark the best one with a star.
# The k values are read back from scores_and_k instead of repeating the sweep's
# range, so the x-axis cannot fall out of step with the loop that filled it.
best_score, best_k = max(scores_and_k)
k_list = [k for _, k in scores_and_k]
fig, ax = plt.subplots()
sns.lineplot(x=k_list, y=accuracies, ax=ax)
ax.set_xlabel('Number of "k" nearest neighbors')
# The recorded values come from a regressor's score(), i.e. R^2 on the test split.
ax.set_ylabel('$R^2$ on the test split')
ax.set_title('Movie-rating KNN regression: $R^2$ vs. k')
ax.plot(best_k, best_score, '-*')
plt.show()
# Close rather than clf(): clf() after show() would open a fresh empty figure.
plt.close(fig)