In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
def missingGraph(dataset, output_folder='missingGraph'):
    """Plot the percentage of missing values per column and save it as a PNG.

    Columns are split into two groups: ``object`` dtype ("categorical") and
    ``number`` dtypes ("numerical"). Both groups are printed and then drawn as
    bars on one shared axis, categorical columns first. Columns that fall in
    neither group are not plotted.

    Args:
        dataset: pandas DataFrame to inspect.
        output_folder: Directory that receives ``missingGraph.png``; it is
            created when missing.
    """
    categorical_features = dataset.select_dtypes(include=['object']).columns
    print("Categorical Features:")
    print(categorical_features)

    numerical_features = dataset.select_dtypes(include=['number']).columns
    print("\nNumerical Features:")
    print(numerical_features)

    # Share of null cells per column, expressed as a percentage.
    missing_percentage = (dataset.isnull().sum() / len(dataset)) * 100
    categorical_missing = missing_percentage[categorical_features]
    numerical_missing = missing_percentage[numerical_features]

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.bar(categorical_missing.index, categorical_missing, color='blue', label='Categorical')
        ax.bar(numerical_missing.index, numerical_missing, color='orange', label='Numerical')

        ax.set_xlabel('Column Name')
        ax.set_ylabel('Percentage of Missing Values')
        ax.set_title('Percentage of Missing Values in Categorical and Numerical Features')
        ax.legend()

        # Work on the explicit axis/figure instead of pyplot's "current" ones,
        # so the result does not depend on global pyplot state.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()

        os.makedirs(output_folder, exist_ok=True)
        fig.savefig(os.path.join(output_folder, 'missingGraph.png'))
    finally:
        # Always release the figure, even when layout or saving fails.
        plt.close(fig)

In [None]:
def outlierBoxplots(dataset, output_folder = 'outlierBoxPlots'):
    """Draw one boxplot per numerical column on a single axis and save it.

    Args:
        dataset: pandas DataFrame whose ``number``-dtype columns are plotted.
        output_folder: Directory that receives ``outlierBoxPlots.png``; it is
            created when missing.
    """
    numeric_columns = list(dataset.select_dtypes(include=['number']).columns)

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        dataset.boxplot(column=numeric_columns, ax=ax)
        ax.set_title('Boxplots for Numerical Features (Identifying Outliers)')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        # Same layout pass the other plot helpers use; without it the rotated
        # tick labels can be cut off at the bottom of the saved image.
        fig.tight_layout()

        os.makedirs(output_folder, exist_ok=True)
        fig.savefig(os.path.join(output_folder, 'outlierBoxPlots.png'))
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

In [None]:
def distributionPlots(dataset, output_folder='plots'):
    """Save a histogram with a KDE overlay for every numerical column.

    One file named ``<column>_distribution_plot.png`` is written per
    ``number``-dtype column.

    Args:
        dataset: pandas DataFrame to plot.
        output_folder: Directory that receives the PNG files; it is created
            when missing. Nothing is created when there is nothing to plot.
    """
    numerical_features = dataset.select_dtypes(include=['number']).columns
    if len(numerical_features) == 0:
        return

    # Loop-invariant: create the target directory once, not per feature.
    os.makedirs(output_folder, exist_ok=True)

    for feature in numerical_features:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            # Draw on the axis created above rather than pyplot's current axis.
            sns.histplot(data=dataset, x=feature, kde=True, ax=ax)
            ax.set_title(f'Distribution Plot - {feature}')
            fig.tight_layout()
            fig.savefig(os.path.join(output_folder, f'{feature}_distribution_plot.png'))
        finally:
            # Close each figure so a long column list (or a failure midway)
            # does not leave figures accumulating in memory.
            plt.close(fig)

In [None]:
def _strip_non_ascii(value):
    """Return ``str(value)`` without non-ASCII characters or surrounding whitespace."""
    return ''.join(char for char in str(value) if ord(char) < 128).strip()


def _digits_to_int(value):
    """Convert a numeric-looking cell to ``int`` (``np.nan`` when impossible).

    Real numbers are truncated directly; going through ``str`` would turn
    ``121.0`` into the digits ``1210``. Any other value is stringified and its
    digit characters are concatenated, so ``"21,062"`` becomes ``21062``.
    """
    if isinstance(value, (int, float, np.integer, np.floating)):
        return int(value) if np.isfinite(value) else np.nan
    digits = ''.join(filter(str.isdigit, str(value)))
    return int(digits) if digits else np.nan


def _extract_credits(credits):
    """Split ``"Director: a, b | Stars: c, d"`` text into ``(directors, stars)``.

    Segments are separated by ``|``. A segment mentioning ``Director`` feeds
    the first result, one mentioning ``Star`` feeds the second; names are
    trimmed and re-joined with ``", "``. Segments without a colon are ignored.
    """
    directors = ""
    stars = ""
    for segment in credits.split('|'):
        _, colon, names = segment.partition(':')
        if not colon:
            # Nothing to parse; do not let it wipe an earlier result.
            continue
        joined = ', '.join(name.strip() for name in names.split(','))
        if 'Director' in segment:
            directors = joined
        elif 'Star' in segment:
            stars = joined
    return directors, stars


def preprocess_data(X):
    """Clean the rows of a 2-D object array and return the rows that survive.

    Per row, in order:

    1. The row is dropped when its column-0 value (the name) was already kept.
    2. The row is dropped when column 4, -3 or -2 is null.
    3. Columns 0, 2, 4 and -4 are stripped of non-ASCII characters and trimmed.
    4. Column 1 is reduced to the integer formed by its first four digits
       (``np.nan`` when it contains no digit).
    5. Column -4 is parsed as ``Director(s): ... | Star(s): ...`` text; the
       parsed names are written to columns 5 and 6.
    6. Columns -3 and -2 are converted to integers.

    After the loop column -4 is removed, and the result is padded with
    ``None`` columns up to a width of 10 when it is narrower.

    Args:
        X: 2-D numpy array (object dtype expected) with at least seven
            columns. It is not modified.

    Returns:
        A new 2-D numpy array with the cleaned rows.
    """
    cleaned_rows = []
    seen_names = set()

    for i in range(X.shape[0]):
        # Work on a copy so the caller's array is left untouched.
        row = X[i, :].copy()
        name = row[0]
        has_name = pd.notnull(name)

        if has_name and name in seen_names:
            continue
        if pd.isnull(row[4]) or pd.isnull(row[-3]) or pd.isnull(row[-2]):
            continue
        if has_name:
            # Only rows that are actually kept claim a name; a discarded row
            # must not suppress a later valid row with the same name.
            seen_names.add(name)
            row[0] = _strip_non_ascii(name)

        if pd.notnull(row[1]):
            year_digits = ''.join(filter(str.isdigit, str(row[1])))
            row[1] = int(year_digits[:4]) if year_digits else np.nan

        if pd.notnull(row[2]):
            row[2] = _strip_non_ascii(row[2])

        # Column 4 is guaranteed non-null by the validity check above.
        row[4] = _strip_non_ascii(row[4])

        if pd.notnull(row[-4]):
            row[-4] = _strip_non_ascii(row[-4])
            row[5], row[6] = _extract_credits(row[-4])

        # Columns -3 and -2 are guaranteed non-null by the validity check above.
        row[-3] = _digits_to_int(row[-3])
        row[-2] = _digits_to_int(row[-2])

        cleaned_rows.append(row)

    if cleaned_rows:
        result = np.array(cleaned_rows)
    else:
        # np.array([]) would be 1-D and break the column deletion below.
        result = np.empty((0, X.shape[1]), dtype=X.dtype)

    result = np.delete(result, -4, axis=1)
    num_columns = result.shape[1]
    # Echo the resulting column count.
    print(num_columns)
    if num_columns < 10:
        result = np.hstack((result, np.full((result.shape[0], 10 - num_columns), None)))

    return result

In [None]:
def main(input_path='movies.csv', output_path='movies_pre_processed.csv'):
    """Run the exploratory plots, clean the dataset and write the result to CSV.

    Args:
        input_path: CSV file to read.
        output_path: CSV file that receives the cleaned data (no index column).
    """
    dataset = pd.read_csv(input_path)

    missingGraph(dataset)
    outlierBoxplots(dataset)
    distributionPlots(dataset)

    # Empty target columns that preprocess_data fills from the credits column.
    dataset.insert(5, 'Directors', None, allow_duplicates=True)
    dataset.insert(6, 'Stars', None, allow_duplicates=True)

    X = preprocess_data(dataset.iloc[:, :].values)

    # preprocess_data drops the column at position -4. Mirror that by position:
    # list.remove() would drop the first label with the same value, which is
    # the wrong one when labels repeat (duplicates are allowed above).
    current_columns = list(dataset.columns)
    del current_columns[-4]

    pre_processed_dataset = pd.DataFrame(X, columns=current_columns)
    pre_processed_dataset.to_csv(output_path, index=False)

In [None]:
# Run the full pipeline only when this code is executed directly (not on import).
if __name__ == '__main__':
    main()