<a href="https://colab.research.google.com/github/Mridullamurugan/Video-game-sales-prediction/blob/main/VGSP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn joblib




In [None]:
# Import every library this cell relies on; stop immediately if one is missing.
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    import joblib
except ImportError as e:
    print(f"Error importing libraries: {e}")
    print("Please ensure all required libraries are installed. Run: !pip install pandas numpy matplotlib seaborn scikit-learn joblib")
    # exit() does not halt a notebook cell (the statements below would still run
    # and fail on undefined names), so re-raise to stop execution here.
    raise

# Read the dataset from the Colab working directory.
try:
    data = pd.read_csv('/content/vgsales.csv')  # Colab-specific path
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'vgsales.csv' not found. Please upload the file to Colab:")
    print("1. Go to the Files tab in the left sidebar.")
    print("2. Click 'Upload' and select 'vgsales.csv'.")
    # Re-raise: without `data` every later statement fails with NameError.
    raise
except Exception as e:
    print(f"Error reading dataset: {e}")
    raise

# Displaying dataset information
print("\nFirst 5 rows of the dataset:")
print(data.head())
print("\nLast 5 rows of the dataset:")
print(data.tail())
print("\nDataset shape:", data.shape)
print("\nDataset info:")
data.info()  # info() writes its own report and returns None, so wrapping it in print() adds a stray "None"

# Check for required columns
required_columns = ["Rank", "Genre", "Global_Sales", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    # exit() does not halt a notebook cell; raise so nothing below runs on an unusable frame.
    raise ValueError(f"Missing required columns: {missing_columns}")

# Dropping null values
print("\nNull values before dropping:")
print(data.isna().sum())
data = data.dropna()
print("\nFirst 10 rows after dropping nulls:")
print(data.head(10))
print("\nNull values after dropping:")
print(data.isna().sum())

# Visualizations
# Donut chart of the ten genres with the most titles
try:
    # The grouped counts come back ordered by genre name, so rank them before
    # keeping ten; head(10) alone would just keep the first ten names.
    game = data.groupby("Genre")["Global_Sales"].count().nlargest(10)
    plt.rc('font', size=12)  # set before drawing so this figure's labels pick it up
    plt.figure(figsize=(7, 7))
    plt.pie(game, labels=game.index, colors=sns.color_palette("PuBu", len(game)), autopct='%1.1f%%')
    central_circle = plt.Circle((0, 0), 0.5, color='white')  # white disc turns the pie into a donut
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.title("Top 10 Categories of Games Sold", fontsize=20)
    plt.show()
except Exception as e:
    # Plots are not essential to the modelling below: report and carry on.
    print(f"Error creating pie chart: {e}")

# Correlation heatmap
try:
    numeric_data = data.select_dtypes(include=[np.number])
    correlation = numeric_data.corr()  # computed once, reused for the printout and the plot
    print("\nCorrelation matrix:")
    print(correlation)
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, cmap="YlOrBr", annot=True, fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
except Exception as e:
    print(f"Error creating heatmap: {e}")

# Converting dependent and independent variables
try:
    # NOTE(review): Rank looks like an ordering by Global_Sales - confirm it does not leak the target.
    x = data[["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]]
    y = data["Global_Sales"]

    # Ensure x contains only numeric data
    if not x.select_dtypes(include=[np.number]).columns.equals(x.columns):
        raise TypeError("Non-numeric data found in features. Please ensure all selected columns are numeric.")
    print("\nFeature data types:")
    print(x.dtypes)
except Exception as e:
    print(f"Error preparing features: {e}")
    # exit() does not halt a notebook cell; re-raise so training never runs on bad inputs.
    raise

# Split data (fixed seed keeps the train/test partition reproducible)
try:
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
except Exception as e:
    print(f"Error splitting data: {e}")
    raise

# Train Decision Tree model
try:
    # Seeded so repeated runs build the same tree and report the same scores.
    dt_model = DecisionTreeRegressor(random_state=42)
    dt_model.fit(xtrain, ytrain)

    # Test Decision Tree model
    dt_predictions = dt_model.predict(xtest)
    dt_mse = mean_squared_error(ytest, dt_predictions)
    dt_r2 = r2_score(ytest, dt_predictions)

    print("\nDecision Tree MSE:", dt_mse)
    print("Decision Tree R^2:", dt_r2)
except Exception as e:
    print(f"Error with Decision Tree model: {e}")
    dt_model = None  # mark as unavailable so the save step cannot pickle an unfitted or stale model

# Train Linear Regression model
try:
    lr_model = LinearRegression()
    lr_model.fit(xtrain, ytrain)

    # Test Linear Regression model
    lr_predictions = lr_model.predict(xtest)
    lr_mse = mean_squared_error(ytest, lr_predictions)
    lr_r2 = r2_score(ytest, lr_predictions)

    print("\nLinear Regression MSE:", lr_mse)
    print("Linear Regression R^2:", lr_r2)
except Exception as e:
    print(f"Error with Linear Regression model: {e}")
    lr_model = None  # see the Decision Tree handler above

# Finalizing and saving the models
try:
    if dt_model is None or lr_model is None:
        raise RuntimeError("a model failed to train, so no files were written")
    joblib.dump(dt_model, '/content/final_decision_tree_model.pkl')
    joblib.dump(lr_model, '/content/final_linear_regression_model.pkl')
    print("\nModels saved as '/content/final_decision_tree_model.pkl' and '/content/final_linear_regression_model.pkl'")
    print("You can download these files from the Files tab in Colab.")
except Exception as e:
    print(f"Error saving models: {e}")

Error: 'vgsales.csv' not found. Please upload the file to Colab:
1. Go to the Files tab in the left sidebar.
2. Click 'Upload' and select 'vgsales.csv'.

First 5 rows of the dataset:


NameError: name 'data' is not defined

In [None]:
# Importing necessary Python libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    import joblib
except ImportError as e:
    print(f"Error importing libraries: {e}")
    print("Run: !pip install pandas numpy matplotlib seaborn scikit-learn joblib")
    exit()

# Enable inline plotting for Colab
%matplotlib inline

# Read the dataset from a reliable URL
url = 'https://raw.githubusercontent.com/gregorut/videogamesales/master/vgsales.csv'
try:
    data = pd.read_csv(url)
    print("Dataset loaded successfully from URL!")
except Exception as e:
    print(f"Error loading dataset from URL: {e}")
    print("Please upload 'vgsales.csv' manually to Colab:")
    print("1. Go to the Files tab in the left sidebar.")
    print("2. Click 'Upload' and select 'vgsales.csv'.")
    print("Then run the following code to load the file:")
    print("""
    from google.colab import files
    uploaded = files.upload()
    data = pd.read_csv('vgsales.csv')
    """)
    exit()

# Displaying dataset information
print("\nFirst 5 rows of the dataset:")
print(data.head())
print("\nLast 5 rows of the dataset:")
print(data.tail())
print("\nDataset shape:", data.shape)
print("\nDataset info:")
data.info()  # info() writes its own report; no print() needed

# Check for required columns
required_columns = ["Rank", "Genre", "Global_Sales", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    # exit() does not halt a notebook cell; raise so nothing below runs on an unusable frame.
    raise ValueError(f"Missing required columns: {missing_columns}")

# Dropping null values
print("\nNull values before dropping:")
print(data.isna().sum())
data = data.dropna()
print("\nFirst 10 rows after dropping nulls:")
print(data.head(10))
print("\nNull values after dropping:")
print(data.isna().sum())

# Visualizations
# Donut chart of the ten genres with the most titles
try:
    # The grouped counts come back ordered by genre name, so rank them before
    # keeping ten; head(10) alone would just keep the first ten names.
    game = data.groupby("Genre")["Global_Sales"].count().nlargest(10)
    plt.rc('font', size=12)  # set before drawing so this figure's labels pick it up
    plt.figure(figsize=(7, 7))
    plt.pie(game, labels=game.index, colors=sns.color_palette("PuBu", len(game)), autopct='%1.1f%%')
    central_circle = plt.Circle((0, 0), 0.5, color='white')  # white disc turns the pie into a donut
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.title("Top 10 Categories of Games Sold", fontsize=20)
    plt.show()
except Exception as e:
    # Plots are not essential to the modelling below: report and carry on.
    print(f"Error creating pie chart: {e}")

# Correlation heatmap
try:
    numeric_data = data.select_dtypes(include=[np.number])
    correlation = numeric_data.corr()  # computed once, reused for the printout and the plot
    print("\nCorrelation matrix:")
    print(correlation)
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, cmap="YlOrBr", annot=True, fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
except Exception as e:
    print(f"Error creating heatmap: {e}")

# Converting dependent and independent variables
try:
    # NOTE(review): Rank looks like an ordering by Global_Sales - confirm it does not leak the target.
    x = data[["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]]
    y = data["Global_Sales"]

    # Ensure x contains only numeric data
    if not x.select_dtypes(include=[np.number]).columns.equals(x.columns):
        raise TypeError("Non-numeric data found in features. Please ensure all selected columns are numeric.")
    print("\nFeature data types:")
    print(x.dtypes)
except Exception as e:
    print(f"Error preparing features: {e}")
    # exit() does not halt a notebook cell; re-raise so training never runs on bad inputs.
    raise

# Split data (fixed seed keeps the train/test partition reproducible)
try:
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
except Exception as e:
    print(f"Error splitting data: {e}")
    raise

# Train Decision Tree model
try:
    # Seeded so repeated runs build the same tree and report the same scores.
    dt_model = DecisionTreeRegressor(random_state=42)
    dt_model.fit(xtrain, ytrain)

    # Test Decision Tree model
    dt_predictions = dt_model.predict(xtest)
    dt_mse = mean_squared_error(ytest, dt_predictions)
    dt_r2 = r2_score(ytest, dt_predictions)

    print("\nDecision Tree MSE:", dt_mse)
    print("Decision Tree R^2:", dt_r2)
except Exception as e:
    print(f"Error with Decision Tree model: {e}")
    dt_model = None  # mark as unavailable so the save step cannot pickle an unfitted or stale model

# Train Linear Regression model
try:
    lr_model = LinearRegression()
    lr_model.fit(xtrain, ytrain)

    # Test Linear Regression model
    lr_predictions = lr_model.predict(xtest)
    lr_mse = mean_squared_error(ytest, lr_predictions)
    lr_r2 = r2_score(ytest, lr_predictions)

    print("\nLinear Regression MSE:", lr_mse)
    print("Linear Regression R^2:", lr_r2)
except Exception as e:
    print(f"Error with Linear Regression model: {e}")
    lr_model = None  # see the Decision Tree handler above

# Finalizing and saving the models
try:
    if dt_model is None or lr_model is None:
        raise RuntimeError("a model failed to train, so no files were written")
    joblib.dump(dt_model, '/content/final_decision_tree_model.pkl')
    joblib.dump(lr_model, '/content/final_linear_regression_model.pkl')
    print("\nModels saved as '/content/final_decision_tree_model.pkl' and '/content/final_linear_regression_model.pkl'")
    print("You can download these files from the Files tab in Colab.")
except Exception as e:
    print(f"Error saving models: {e}")

Error loading dataset from URL: HTTP Error 404: Not Found
Please upload 'vgsales.csv' manually to Colab:
1. Go to the Files tab in the left sidebar.
2. Click 'Upload' and select 'vgsales.csv'.
Then run the following code to load the file:

    from google.colab import files
    uploaded = files.upload()
    data = pd.read_csv('vgsales.csv')
    

First 5 rows of the dataset:


NameError: name 'data' is not defined

In [None]:
# Importing necessary Python libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    import joblib
except ImportError as e:
    print(f"Error importing libraries: {e}")
    print("Run: !pip install pandas numpy matplotlib seaborn scikit-learn joblib")
    exit()

# Enable inline plotting for Colab
%matplotlib inline

# Upload the dataset in Colab
from google.colab import files
print("Please upload 'vgsales.csv' using the file upload dialog.")
uploaded = files.upload()

# Read the dataset
try:
    data = pd.read_csv('vgsales.csv')  # Colab stores uploaded files in /content/
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'vgsales.csv' not found. Please ensure you uploaded the file using the dialog above.")
    exit()
except Exception as e:
    print(f"Error reading dataset: {e}")
    exit()

# Displaying dataset information
print("\nFirst 5 rows of the dataset:")
print(data.head())
print("\nLast 5 rows of the dataset:")
print(data.tail())
print("\nDataset shape:", data.shape)
print("\nDataset info:")
data.info()  # info() writes its own report and returns None, so wrapping it in print() adds a stray "None"

# Check for required columns
required_columns = ["Rank", "Genre", "Global_Sales", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    # exit() does not halt a notebook cell; raise so nothing below runs on an unusable frame.
    raise ValueError(f"Missing required columns: {missing_columns}")

# Dropping null values
print("\nNull values before dropping:")
print(data.isna().sum())
data = data.dropna()
print("\nFirst 10 rows after dropping nulls:")
print(data.head(10))
print("\nNull values after dropping:")
print(data.isna().sum())

# Visualizations
# Donut chart of the ten genres with the most titles
try:
    # The grouped counts come back ordered by genre name, so rank them before
    # keeping ten; head(10) alone would just keep the first ten names.
    game = data.groupby("Genre")["Global_Sales"].count().nlargest(10)
    plt.rc('font', size=12)  # set before drawing so this figure's labels pick it up
    plt.figure(figsize=(7, 7))
    plt.pie(game, labels=game.index, colors=sns.color_palette("PuBu", len(game)), autopct='%1.1f%%')
    central_circle = plt.Circle((0, 0), 0.5, color='white')  # white disc turns the pie into a donut
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.title("Top 10 Categories of Games Sold", fontsize=20)
    plt.show()
except Exception as e:
    # Plots are not essential to the modelling below: report and carry on.
    print(f"Error creating pie chart: {e}")

# Correlation heatmap
try:
    numeric_data = data.select_dtypes(include=[np.number])
    correlation = numeric_data.corr()  # computed once, reused for the printout and the plot
    print("\nCorrelation matrix:")
    print(correlation)
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, cmap="YlOrBr", annot=True, fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
except Exception as e:
    print(f"Error creating heatmap: {e}")

# Converting dependent and independent variables
try:
    # NOTE(review): Rank looks like an ordering by Global_Sales - confirm it does not leak the target.
    x = data[["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]]
    y = data["Global_Sales"]

    # Ensure x contains only numeric data
    if not x.select_dtypes(include=[np.number]).columns.equals(x.columns):
        raise TypeError("Non-numeric data found in features. Please ensure all selected columns are numeric.")
    print("\nFeature data types:")
    print(x.dtypes)
except Exception as e:
    print(f"Error preparing features: {e}")
    # exit() does not halt a notebook cell; re-raise so training never runs on bad inputs.
    raise

# Split data (fixed seed keeps the train/test partition reproducible)
try:
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
except Exception as e:
    print(f"Error splitting data: {e}")
    raise

# Train Decision Tree model
try:
    # Seeded so repeated runs build the same tree and report the same scores.
    dt_model = DecisionTreeRegressor(random_state=42)
    dt_model.fit(xtrain, ytrain)

    # Test Decision Tree model
    dt_predictions = dt_model.predict(xtest)
    dt_mse = mean_squared_error(ytest, dt_predictions)
    dt_r2 = r2_score(ytest, dt_predictions)

    print("\nDecision Tree MSE:", dt_mse)
    print("Decision Tree R^2:", dt_r2)
except Exception as e:
    print(f"Error with Decision Tree model: {e}")
    dt_model = None  # mark as unavailable so the save step cannot pickle an unfitted or stale model

# Train Linear Regression model
try:
    lr_model = LinearRegression()
    lr_model.fit(xtrain, ytrain)

    # Test Linear Regression model
    lr_predictions = lr_model.predict(xtest)
    lr_mse = mean_squared_error(ytest, lr_predictions)
    lr_r2 = r2_score(ytest, lr_predictions)

    print("\nLinear Regression MSE:", lr_mse)
    print("Linear Regression R^2:", lr_r2)
except Exception as e:
    print(f"Error with Linear Regression model: {e}")
    lr_model = None  # see the Decision Tree handler above

# Finalizing and saving the models
try:
    if dt_model is None or lr_model is None:
        raise RuntimeError("a model failed to train, so no files were written")
    joblib.dump(dt_model, '/content/final_decision_tree_model.pkl')
    joblib.dump(lr_model, '/content/final_linear_regression_model.pkl')
    print("\nModels saved as '/content/final_decision_tree_model.pkl' and '/content/final_linear_regression_model.pkl'")
    print("You can download these files from the Files tab in Colab.")
except Exception as e:
    print(f"Error saving models: {e}")

Please upload 'vgsales.csv' using the file upload dialog.


KeyboardInterrupt: 

In [None]:
# Importing necessary Python libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    import joblib
except ImportError as e:
    print(f"Error importing libraries: {e}")
    print("Run: !pip install pandas numpy matplotlib seaborn scikit-learn joblib")
    exit()

# Enable inline plotting for Colab
%matplotlib inline

# Attempt to load the dataset from a reliable URL
url = 'https://raw.githubusercontent.com/gregorut/videogamesales/master/vgsales.csv'  # Reliable source
try:
    data = pd.read_csv(url)
    print("Dataset loaded successfully from URL!")
except Exception as e:
    print(f"Error loading dataset from URL: {e}")
    print("Please upload 'vgsales.csv' manually to Colab:")
    print("1. Go to the Files tab in the left sidebar.")
    print("2. Click 'Upload' and select 'vgsales.csv'.")
    from google.colab import files
    uploaded = files.upload()
    try:
        data = pd.read_csv('vgsales.csv')
        print("Dataset loaded successfully from uploaded file!")
    except FileNotFoundError:
        print("Error: 'vgsales.csv' not found. Please ensure you uploaded the file.")
        exit()
    except Exception as e:
        print(f"Error reading uploaded dataset: {e}")
        exit()

# Displaying dataset information
print("\nFirst 5 rows of the dataset:")
print(data.head())
print("\nLast 5 rows of the dataset:")
print(data.tail())
print("\nDataset shape:", data.shape)
print("\nDataset info:")
data.info()  # info() writes its own report and returns None, so wrapping it in print() adds a stray "None"

# Check for required columns
required_columns = ["Rank", "Genre", "Global_Sales", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    # exit() does not halt a notebook cell; raise so nothing below runs on an unusable frame.
    raise ValueError(f"Missing required columns: {missing_columns}")

# Dropping null values
print("\nNull values before dropping:")
print(data.isna().sum())
data = data.dropna()
print("\nFirst 10 rows after dropping nulls:")
print(data.head(10))
print("\nNull values after dropping:")
print(data.isna().sum())

# Visualizations
# Donut chart of the ten genres with the most titles
try:
    # The grouped counts come back ordered by genre name, so rank them before
    # keeping ten; head(10) alone would just keep the first ten names.
    game = data.groupby("Genre")["Global_Sales"].count().nlargest(10)
    plt.rc('font', size=12)  # set before drawing so this figure's labels pick it up
    plt.figure(figsize=(7, 7))
    plt.pie(game, labels=game.index, colors=sns.color_palette("PuBu", len(game)), autopct='%1.1f%%')
    central_circle = plt.Circle((0, 0), 0.5, color='white')  # white disc turns the pie into a donut
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.title("Top 10 Categories of Games Sold", fontsize=20)
    plt.show()
except Exception as e:
    # Plots are not essential to the modelling below: report and carry on.
    print(f"Error creating pie chart: {e}")

# Correlation heatmap
try:
    numeric_data = data.select_dtypes(include=[np.number])
    correlation = numeric_data.corr()  # computed once, reused for the printout and the plot
    print("\nCorrelation matrix:")
    print(correlation)
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, cmap="YlOrBr", annot=True, fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
except Exception as e:
    print(f"Error creating heatmap: {e}")

# Converting dependent and independent variables
try:
    # NOTE(review): Rank looks like an ordering by Global_Sales - confirm it does not leak the target.
    x = data[["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]]
    y = data["Global_Sales"]

    # Ensure x contains only numeric data
    if not x.select_dtypes(include=[np.number]).columns.equals(x.columns):
        raise TypeError("Non-numeric data found in features. Please ensure all selected columns are numeric.")
    print("\nFeature data types:")
    print(x.dtypes)
except Exception as e:
    print(f"Error preparing features: {e}")
    # exit() does not halt a notebook cell; re-raise so training never runs on bad inputs.
    raise

# Split data (fixed seed keeps the train/test partition reproducible)
try:
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
except Exception as e:
    print(f"Error splitting data: {e}")
    raise

# Train Decision Tree model
try:
    # Seeded so repeated runs build the same tree and report the same scores.
    dt_model = DecisionTreeRegressor(random_state=42)
    dt_model.fit(xtrain, ytrain)

    # Test Decision Tree model
    dt_predictions = dt_model.predict(xtest)
    dt_mse = mean_squared_error(ytest, dt_predictions)
    dt_r2 = r2_score(ytest, dt_predictions)

    print("\nDecision Tree MSE:", dt_mse)
    print("Decision Tree R^2:", dt_r2)
except Exception as e:
    print(f"Error with Decision Tree model: {e}")
    dt_model = None  # mark as unavailable so the save step cannot pickle an unfitted or stale model

# Train Linear Regression model
try:
    lr_model = LinearRegression()
    lr_model.fit(xtrain, ytrain)

    # Test Linear Regression model
    lr_predictions = lr_model.predict(xtest)
    lr_mse = mean_squared_error(ytest, lr_predictions)
    lr_r2 = r2_score(ytest, lr_predictions)

    print("\nLinear Regression MSE:", lr_mse)
    print("Linear Regression R^2:", lr_r2)
except Exception as e:
    print(f"Error with Linear Regression model: {e}")
    lr_model = None  # see the Decision Tree handler above

# Finalizing and saving the models
try:
    if dt_model is None or lr_model is None:
        raise RuntimeError("a model failed to train, so no files were written")
    joblib.dump(dt_model, '/content/final_decision_tree_model.pkl')
    joblib.dump(lr_model, '/content/final_linear_regression_model.pkl')
    print("\nModels saved as '/content/final_decision_tree_model.pkl' and '/content/final_linear_regression_model.pkl'")
    print("You can download these files from the Files tab in Colab.")
except Exception as e:
    print(f"Error saving models: {e}")

Error loading dataset from URL: HTTP Error 404: Not Found
Please upload 'vgsales.csv' manually to Colab:
1. Go to the Files tab in the left sidebar.
2. Click 'Upload' and select 'vgsales.csv'.


Saving marine_articles (1).csv to marine_articles (1) (1).csv
Error: 'vgsales.csv' not found. Please ensure you uploaded the file.

First 5 rows of the dataset:


NameError: name 'data' is not defined

In [None]:
# Importing necessary Python libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    import joblib
except ImportError as e:
    print(f"Error importing libraries: {e}")
    print("Run: !pip install pandas numpy matplotlib seaborn scikit-learn joblib")
    exit()

# Enable inline plotting for Colab
%matplotlib inline

# Read the dataset from the uploaded file
try:
    data = pd.read_csv('vgsales.csv')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'vgsales.csv' not found. Please make sure you have uploaded the file to your Colab session.")
    exit()
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    exit()


# Displaying dataset information
print("\nFirst 5 rows of the dataset:")
print(data.head())
print("\nLast 5 rows of the dataset:")
print(data.tail())
print("\nDataset shape:", data.shape)
print("\nDataset info:")
data.info()  # info() writes its own report; no print() needed

# Check for required columns
required_columns = ["Rank", "Genre", "Global_Sales", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    # exit() does not halt a notebook cell; raise so nothing below runs on an unusable frame.
    raise ValueError(f"Missing required columns: {missing_columns}")

# Dropping null values
print("\nNull values before dropping:")
print(data.isna().sum())
data = data.dropna()
print("\nFirst 10 rows after dropping nulls:")
print(data.head(10))
print("\nNull values after dropping:")
print(data.isna().sum())

# Visualizations
# Donut chart of the ten genres with the most titles
try:
    # The grouped counts come back ordered by genre name, so rank them before
    # keeping ten; head(10) alone would just keep the first ten names.
    game = data.groupby("Genre")["Global_Sales"].count().nlargest(10)
    plt.rc('font', size=12)  # set before drawing so this figure's labels pick it up
    plt.figure(figsize=(7, 7))
    plt.pie(game, labels=game.index, colors=sns.color_palette("PuBu", len(game)), autopct='%1.1f%%')
    central_circle = plt.Circle((0, 0), 0.5, color='white')  # white disc turns the pie into a donut
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.title("Top 10 Categories of Games Sold", fontsize=20)
    plt.show()
except Exception as e:
    # Plots are not essential to the modelling below: report and carry on.
    print(f"Error creating pie chart: {e}")

# Correlation heatmap
try:
    numeric_data = data.select_dtypes(include=[np.number])
    correlation = numeric_data.corr()  # computed once, reused for the printout and the plot
    print("\nCorrelation matrix:")
    print(correlation)
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, cmap="YlOrBr", annot=True, fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
except Exception as e:
    print(f"Error creating heatmap: {e}")

# Converting dependent and independent variables
try:
    # NOTE(review): Rank looks like an ordering by Global_Sales - confirm it does not leak the target.
    x = data[["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]]
    y = data["Global_Sales"]

    # Ensure x contains only numeric data
    if not x.select_dtypes(include=[np.number]).columns.equals(x.columns):
        raise TypeError("Non-numeric data found in features. Please ensure all selected columns are numeric.")
    print("\nFeature data types:")
    print(x.dtypes)
except Exception as e:
    print(f"Error preparing features: {e}")
    # exit() does not halt a notebook cell; re-raise so training never runs on bad inputs.
    raise

# Split data (fixed seed keeps the train/test partition reproducible)
try:
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
except Exception as e:
    print(f"Error splitting data: {e}")
    raise

# Train Decision Tree model
try:
    # Seeded so repeated runs build the same tree and report the same scores.
    dt_model = DecisionTreeRegressor(random_state=42)
    dt_model.fit(xtrain, ytrain)

    # Test Decision Tree model
    dt_predictions = dt_model.predict(xtest)
    dt_mse = mean_squared_error(ytest, dt_predictions)
    dt_r2 = r2_score(ytest, dt_predictions)

    print("\nDecision Tree MSE:", dt_mse)
    print("Decision Tree R^2:", dt_r2)
except Exception as e:
    print(f"Error with Decision Tree model: {e}")
    dt_model = None  # mark as unavailable so the save step cannot pickle an unfitted or stale model

# Train Linear Regression model
try:
    lr_model = LinearRegression()
    lr_model.fit(xtrain, ytrain)

    # Test Linear Regression model
    lr_predictions = lr_model.predict(xtest)
    lr_mse = mean_squared_error(ytest, lr_predictions)
    lr_r2 = r2_score(ytest, lr_predictions)

    print("\nLinear Regression MSE:", lr_mse)
    print("Linear Regression R^2:", lr_r2)
except Exception as e:
    print(f"Error with Linear Regression model: {e}")
    lr_model = None  # see the Decision Tree handler above

# Finalizing and saving the models
try:
    if dt_model is None or lr_model is None:
        raise RuntimeError("a model failed to train, so no files were written")
    joblib.dump(dt_model, '/content/final_decision_tree_model.pkl')
    joblib.dump(lr_model, '/content/final_linear_regression_model.pkl')
    print("\nModels saved as '/content/final_decision_tree_model.pkl' and '/content/final_linear_regression_model.pkl'")
    print("You can download these files from the Files tab in Colab.")
except Exception as e:
    print(f"Error saving models: {e}")

Error: 'vgsales.csv' not found. Please make sure you have uploaded the file to your Colab session.

First 5 rows of the dataset:


NameError: name 'data' is not defined

In [None]:
# Importing necessary Python libraries
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    import joblib
except ImportError as e:
    print(f"Error importing libraries: {e}")
    print("Run: !pip install pandas numpy matplotlib seaborn scikit-learn joblib")
    exit()

# Enable inline plotting for Colab
%matplotlib inline

# Read the dataset from the uploaded file
try:
    data = pd.read_csv('vgsales.csv')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'vgsales.csv' not found. Please make sure you have uploaded the file to your Colab session.")
    exit()
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    exit()


# Displaying dataset information
print("\nFirst 5 rows of the dataset:")
print(data.head())
print("\nLast 5 rows of the dataset:")
print(data.tail())
print("\nDataset shape:", data.shape)
print("\nDataset info:")
data.info()  # info() writes its own report; no print() needed

# Check for required columns
required_columns = ["Rank", "Genre", "Global_Sales", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    # exit() does not halt a notebook cell; raise so nothing below runs on an unusable frame.
    raise ValueError(f"Missing required columns: {missing_columns}")

# Dropping null values
print("\nNull values before dropping:")
print(data.isna().sum())
data = data.dropna()
print("\nFirst 10 rows after dropping nulls:")
print(data.head(10))
print("\nNull values after dropping:")
print(data.isna().sum())

# Visualizations
# Donut chart of the ten genres with the most titles
try:
    # The grouped counts come back ordered by genre name, so rank them before
    # keeping ten; head(10) alone would just keep the first ten names.
    game = data.groupby("Genre")["Global_Sales"].count().nlargest(10)
    plt.rc('font', size=12)  # set before drawing so this figure's labels pick it up
    plt.figure(figsize=(7, 7))
    plt.pie(game, labels=game.index, colors=sns.color_palette("PuBu", len(game)), autopct='%1.1f%%')
    central_circle = plt.Circle((0, 0), 0.5, color='white')  # white disc turns the pie into a donut
    fig = plt.gcf()
    fig.gca().add_artist(central_circle)
    plt.title("Top 10 Categories of Games Sold", fontsize=20)
    plt.show()
except Exception as e:
    # Plots are not essential to the modelling below: report and carry on.
    print(f"Error creating pie chart: {e}")

# Correlation heatmap
try:
    numeric_data = data.select_dtypes(include=[np.number])
    correlation = numeric_data.corr()  # computed once, reused for the printout and the plot
    print("\nCorrelation matrix:")
    print(correlation)
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation, cmap="YlOrBr", annot=True, fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
except Exception as e:
    print(f"Error creating heatmap: {e}")

# Converting dependent and independent variables
try:
    # NOTE(review): Rank looks like an ordering by Global_Sales - confirm it does not leak the target.
    x = data[["Rank", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales"]]
    y = data["Global_Sales"]

    # Ensure x contains only numeric data
    if not x.select_dtypes(include=[np.number]).columns.equals(x.columns):
        raise TypeError("Non-numeric data found in features. Please ensure all selected columns are numeric.")
    print("\nFeature data types:")
    print(x.dtypes)
except Exception as e:
    print(f"Error preparing features: {e}")
    # exit() does not halt a notebook cell; re-raise so training never runs on bad inputs.
    raise

# Split data (fixed seed keeps the train/test partition reproducible)
try:
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
except Exception as e:
    print(f"Error splitting data: {e}")
    raise

# Train Decision Tree model
try:
    # Seeded so repeated runs build the same tree and report the same scores.
    dt_model = DecisionTreeRegressor(random_state=42)
    dt_model.fit(xtrain, ytrain)

    # Test Decision Tree model
    dt_predictions = dt_model.predict(xtest)
    dt_mse = mean_squared_error(ytest, dt_predictions)
    dt_r2 = r2_score(ytest, dt_predictions)

    print("\nDecision Tree MSE:", dt_mse)
    print("Decision Tree R^2:", dt_r2)
except Exception as e:
    print(f"Error with Decision Tree model: {e}")
    dt_model = None  # mark as unavailable so the save step cannot pickle an unfitted or stale model

# Train Linear Regression model
try:
    lr_model = LinearRegression()
    lr_model.fit(xtrain, ytrain)

    # Test Linear Regression model
    lr_predictions = lr_model.predict(xtest)
    lr_mse = mean_squared_error(ytest, lr_predictions)
    lr_r2 = r2_score(ytest, lr_predictions)

    print("\nLinear Regression MSE:", lr_mse)
    print("Linear Regression R^2:", lr_r2)
except Exception as e:
    print(f"Error with Linear Regression model: {e}")
    lr_model = None  # see the Decision Tree handler above

# Finalizing and saving the models
try:
    if dt_model is None or lr_model is None:
        raise RuntimeError("a model failed to train, so no files were written")
    joblib.dump(dt_model, '/content/final_decision_tree_model.pkl')
    joblib.dump(lr_model, '/content/final_linear_regression_model.pkl')
    print("\nModels saved as '/content/final_decision_tree_model.pkl' and '/content/final_linear_regression_model.pkl'")
    print("You can download these files from the Files tab in Colab.")
except Exception as e:
    print(f"Error saving models: {e}")