In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import FeatureHasher

In [98]:
def missingGraph(dataset, output_folder='missingGraph'):
    categorical_features = dataset.select_dtypes(include=['object']).columns
    print("Categorical Features:")
    print(categorical_features)

    numerical_features = dataset.select_dtypes(include=['number']).columns
    print("\nNumerical Features:")
    print(numerical_features)

    missing_percentage = (dataset.isnull().sum() / len(dataset)) * 100
    categorical_missing = missing_percentage[categorical_features]
    numerical_missing = missing_percentage[numerical_features]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(categorical_missing.index, categorical_missing, color='blue', label='Categorical')
    ax.bar(numerical_missing.index, numerical_missing, color='orange', label='Numerical')

    ax.set_xlabel('Column Name')
    ax.set_ylabel('Percentage of Missing Values')
    ax.set_title('Percentage of Missing Values in Categorical and Numerical Features')
    ax.legend()

    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, f'missingGraph.png')
    plt.savefig(output_path)
    plt.close()

In [99]:
def outlierBoxplots(dataset, output_folder = 'outlierBoxPlots'):
    fig, ax = plt.subplots(figsize=(12, 8))
    dataset.boxplot(column=list(dataset.select_dtypes(include=['number']).columns), ax=ax)
    ax.set_title('Boxplots for Numerical Features (Identifying Outliers)')
    plt.xticks(rotation=45, ha='right')

    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, f'outlierBoxPlots.png')
    plt.savefig(output_path)
    plt.close()

In [100]:
def distributionPlots(dataset, output_folder='plots'):
    numerical_features = dataset.select_dtypes(include=['number']).columns
    for feature in numerical_features:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(data=dataset, x=feature, kde=True)
        ax.set_title(f'Distribution Plot - {feature}')
        plt.tight_layout()
        
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, f'{feature}_distribution_plot.png')
        plt.savefig(output_path)
        plt.close()

In [101]:
def remove_nan(X):
    temp_X = []
    uniqueName = set()

    for i in range(X.shape[0]):
        if pd.notnull(X[i, 0]):
            current_value = X[i, 0]
            if current_value in uniqueName:
                continue
            else:
                uniqueName.add(current_value)

        if not (pd.notnull(X[i, 4]) and pd.notnull(X[i, -3]) and pd.notnull(X[i, -2])):
            continue

        if pd.notnull(X[i, -1]):
            X[i, -1] = (float(X[i, -1].replace('$', '').replace('M', '')) * 1e6)
        if pd.notnull(X[i, 0]):
            X[i, 0] = ''.join([char for char in str(X[i, 0]) if ord(char) < 128])
            X[i, 0] = X[i, 0].strip()
        
        if pd.notnull(X[i, 1]):
            year_digits = ''.join(filter(str.isdigit, str(X[i, 1])))
            X[i, 1] = int(year_digits[:4]) if year_digits else np.nan
        
        if pd.notnull(X[i, 2]):
            X[i, 2] = ''.join([char for char in str(X[i, 2]) if ord(char) < 128])
            X[i, 2] = X[i, 2].strip()
        
        if pd.notnull(X[i, 4]):
            X[i, 4] = ''.join([char for char in str(X[i, 4]) if ord(char) < 128])
            X[i, 4] = X[i, 4].strip()
        
        if pd.notnull(X[i, -4]):
            X[i, -4] = ''.join([char for char in str(X[i, -4]) if ord(char) < 128])
            X[i, -4] = X[i, -4].strip()
        
        if pd.notnull(X[i, -3]):
            X[i, -3] = int(''.join(filter(str.isdigit, str(X[i, -3]))))

        if pd.notnull(X[i, -2]):
            X[i, -2] = int(''.join(filter(str.isdigit, str(X[i, -2]))))
        
        temp_X.append(X[i, :].copy())

    X = np.array(temp_X)
    X = np.delete(X, [4, 5], axis=1)
    num_columns = X.shape[1]
    if num_columns < 7:
        X = np.hstack((X, np.full((X.shape[0], 7 - num_columns), None)))

    return X

In [102]:
def preProcess(X):
    label_encoder = LabelEncoder()
    X[:, 0] = label_encoder.fit_transform(X[:, 0])

    df = pd.DataFrame(X, columns=['MOVIES', 'YEAR', 'GENRE', 'RATING', 'VOTES', 'RunTime', 'Gross'])
    df_genre_split = df['GENRE'].str.get_dummies(', ')
    
    # Drop the original 'GENRE' column
    df = df.drop('GENRE', axis=1)

    # Concatenate the one-hot encoded genres
    df_encoded = pd.concat([df, df_genre_split], axis=1)
    
    # Rearrange the columns
    list_ = list(df_encoded.columns)
    column_order = ['MOVIES', 'YEAR'] + list_[6: len(list_) - 4] + ['RATING', 'VOTES', 'RunTime', 'Gross']

    # Reorder columns
    df_encoded = df_encoded[column_order]

    # Convert back to numpy ndarray
    X_encoded = df_encoded.to_numpy()

    return X_encoded, list(df_encoded.columns)


In [103]:
def pre_processed_csv(X, dataset_columns):
    print(dataset_columns)
    dataset_columns.remove(dataset_columns[-4])
    dataset_columns.remove(dataset_columns[-5])
    
    X = np.delete(X, [-5, -4], axis=1)

    pre_processed_dataset = pd.DataFrame(X, columns=dataset_columns)
    pre_processed_dataset.to_csv('movies_pre_processed.csv', index=False)
    
    return X
    

In [104]:
def trainTestSplit(X, col):
    train_filename='movies_train.csv'
    test_filename='movies_test.csv'

    train_data = []
    test_data = []

    for i in range(X.shape[0]):
        if pd.notnull(X[i, -1]):
            test_data.append(X[i, :])
        else:
            train_data.append(X[i, :])

    train_array = np.array(train_data)
    test_array = np.array(test_data)

    train_df = pd.DataFrame(train_array, columns=[col[i] for i in range(train_array.shape[1])])
    test_df = pd.DataFrame(test_array, columns=[col[i] for i in range(test_array.shape[1])])

    train_df.to_csv(train_filename, index=False)
    test_df.to_csv(test_filename, index=False)


In [105]:
def main():
    dataset = pd.read_csv('movies.csv')

    missingGraph(dataset)
    outlierBoxplots(dataset)
    distributionPlots(dataset)
    
    X = dataset.iloc[:, :].values
    X = remove_nan(X)
    X, current_columns = preProcess(X)
    
    X = pre_processed_csv(X, current_columns)
    trainTestSplit(X, current_columns)

In [106]:
if __name__ == '__main__':
    main()

Categorical Features:
Index(['MOVIES', 'YEAR', 'GENRE', 'ONE-LINE', 'STARS', 'VOTES', 'Gross'], dtype='object')

Numerical Features:
Index(['RATING', 'RunTime'], dtype='object')
['MOVIES', 'YEAR', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'RATING', 'VOTES', 'RunTime', 'Gross']
