In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from plotnine import *

### Data Understanding

In [None]:
# Load the training set and the test inputs from the Kaggle input directory
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/train_data.csv",
        "/kaggle/input/test_data_with_inputs.csv",
    )
)

In [None]:
# Preview the first ten rows of the training frame
df.iloc[:10]

In [None]:
# Relationship between the imdb score and the profit made by the movie.
# `data` and `mapping` are passed by keyword so the call does not depend on
# positional order: current plotnine takes the data frame as the first
# positional parameter, so `ggplot(aes(...), data=df)` binds `data` twice
# and raises a TypeError.
(
    ggplot(data=df, mapping=aes(x='imdb_score', y='Profit'))
    + geom_line()
    + stat_smooth(colour='blue', span=1)  # blue smoothed trend line over the raw line
)

In [None]:
# Top 20 actors of movies based on the imdb rating of the movies

plt.figure(figsize=(10, 8))

# new dataframe with the 20 highest imdb_score rows
new_df = df.sort_values(by='imdb_score', ascending=False).head(20)

# plotting: lead actor on x, score on y, one hue per movie title.
# Columns are passed by keyword together with `data=` because
# seaborn >= 0.12 no longer accepts x and y as positional arguments.
ax = sns.pointplot(data=new_df, x='actor_1_name', y='imdb_score', hue='movie_title')

# Rotate the existing tick labels in place so long actor names stay readable
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Correlation heat map - shows which features carry similar information;
# pairs above 0.5 corr score have significant overlap in information

# calc corr on numeric/bool columns only: on pandas >= 2.0 DataFrame.corr()
# no longer drops text columns silently and raises on them instead
corr = df.select_dtypes(include=['number', 'bool']).corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13, 7))

# boolean mask over the strict upper triangle so each pair is shown only once
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
heatmap_ax = sns.heatmap(corr, mask=mask, annot=True, fmt='.2f')

# rotate tick labels so long column names do not overlap
heatmap_ax.tick_params(axis='x', labelrotation=90)
heatmap_ax.tick_params(axis='y', labelrotation=30)

### Pre-processing

In [None]:
### Tip 1 - Handle different kinds of data types

### Handling categorical data
# One-hot encode the listed categorical columns; drop_first=True keeps k-1
# indicator columns for k levels by removing the first level.
categorical_cols = ['content_rating']
df2 = pd.get_dummies(
    df,
    columns=categorical_cols,
    prefix=categorical_cols,
    drop_first=True,
)

### Handle different kinds of data - text etc., as you see fit

In [1]:
### Tip 2 - Based on above data analysis, choose all or relevant features
### Tip 3 - Feel free to extract most important features using PCA, regularisation, above correlation heatmaps etc.
### Feel free to convert the text to textual feature vectors, and use those as input too.

In [None]:
# Tip 4 - pandas data frames can be directly used in train and test split creation
# Holds out 20% of the rows (test_size=0.2) with a fixed seed (random_state=100)
# so the split is the same on every run.
# NOTE(review): `train_test_split`, `x` and `y` are not defined or imported in any
# cell above - presumably `from sklearn.model_selection import train_test_split`
# plus a feature frame `x` and target `y` built from the pre-processing step are
# expected first; confirm before running.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [None]:
# Tip 5- Don't forget feature scaling; check K-NN bootcamp text

### Model Selection

In [2]:
### Experiment with different regression models
### https://scikit-learn.org/stable/supervised_learning.html

### Evaluation and saving output

In [None]:
# Creating output file for submission - Template Code

# NOTE(review): the data-loading cell reads the same file name directly from
# '/kaggle/input/' - confirm which of the two paths is the right one.
test = pd.read_csv('/kaggle/input/test-imdb/test_data_with_inputs.csv')

# Convert all submission data to same input format as done for train data
# run prediction as y_pred = model.predict(<processed submission features>)
# NOTE(review): predict on the processed `test` frame here, presumably not on the
# X_test validation split created by train_test_split - confirm.
# y_pred contains IMDB scores

# Many regressors return a 1-D array. On a 1-D array `y_pred[..., 0]` is only the
# first prediction (a scalar), which pandas would broadcast to every row.
# Take index 0 of the last axis only when the array is not 1-D.
predictions = np.asarray(y_pred)
if predictions.ndim != 1:
    predictions = predictions[..., 0]

submission = pd.DataFrame({'s_no': test['s_no'], 'output': predictions}).set_index('s_no')
submission.to_csv('output_submission.csv')