In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [14]:
def scatterPlot(dataset, independent, dependent, output_folder='scatterPlots'):
    """
    Save one scatter plot per independent variable against the dependent variable.

    Each figure is written to ``<output_folder>/<variable>vsAveragePrice.png``.
    Note that the plot title, the y-axis label and the file name suffix are
    hard-coded to "Average Price" / "AveragePrice"; they do not follow the
    name passed as ``dependent``.

    Parameters:
        - dataset (pd.DataFrame): The input dataset containing both independent and dependent variables.
        - independent (iterable of str): Column names plotted on the x-axis, one figure each.
        - dependent (str): Column name plotted on the y-axis of every figure.
        - output_folder (str): The folder where the scatterplots will be saved. Default is 'scatterPlots'.
          It is created if missing, even when ``independent`` is empty.

    Returns:
        None
    """
    # Create the output folder once up front; it does not depend on the loop variable
    os.makedirs(output_folder, exist_ok=True)

    # Loop through each independent variable
    for variable in independent:
        # Create a new figure and keep a handle so exactly this figure is closed
        fig = plt.figure(figsize=(8, 6))
        try:
            # Generate scatter plot
            sns.scatterplot(x=dataset[variable], y=dataset[dependent])

            # Set plot title and axis labels
            plt.title(f'Scatter Plot: {variable} vs Average Price')
            plt.xlabel(variable)
            plt.ylabel('Average Price')
            plt.tight_layout()

            # Save the scatterplot as an image
            output_path = os.path.join(output_folder, f'{variable}vsAveragePrice.png')
            plt.savefig(output_path)
        finally:
            # Release the figure even if plotting or saving failed, so a bad
            # column does not leave open figures accumulating in memory
            plt.close(fig)

In [15]:
def correlationMatrix(dataset, independent, dependent, output_folder='correlation_matrix'):
    """
    Save a heatmap of the dataset's correlation matrix and return the
    correlations between the independent variables and the dependent variable.

    The matrix comes from ``dataset.corr()`` with default arguments and is
    drawn for ALL columns of that matrix, not only the requested ones. The
    image is written to ``<output_folder>/correlation_matrix.png``.

    Parameters:
        - dataset (pd.DataFrame): The input dataset containing both independent and dependent variables.
        - independent (list): List of independent variable names.
        - dependent (str): Name of the dependent variable.
        - output_folder (str): The folder where the heatmap will be saved. Default is 'correlation_matrix'.

    Returns:
        The result of ``correlation_matrix.loc[independent, dependent]``: the
        correlation of each independent variable with the dependent variable.
        (Earlier versions computed this value and discarded it.)
    """
    # Calculate the correlation matrix
    correlation_matrix = dataset.corr()

    # Extract the correlation coefficients between independent and dependent
    # variables. Done before any plotting so an unknown label fails early,
    # before the large figure is allocated.
    correlation_coefficients = correlation_matrix.loc[independent, dependent]

    fig = plt.figure(figsize=(50, 48))
    try:
        # Plot the correlation matrix heatmap for better visualization
        sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".3f")
        plt.title('Correlation Matrix')

        # Save the plot as a PNG image
        os.makedirs(output_folder, exist_ok=True)
        save_file_path = os.path.join(output_folder, 'correlation_matrix.png')
        plt.savefig(save_file_path)
    finally:
        # Always release the figure, even if drawing or saving raised
        plt.close(fig)

    return correlation_coefficients


In [16]:
def linearRegressionSummary(dataset, independent, dependent, output_folder='linear_regression_summary'):
    """
    Fit a statsmodels OLS regression and save its summary as plain text.

    The report is written to ``<output_folder>/linear_regression_summary.txt``
    and starts with a one-line 'statsmodels.summary()' header followed by the
    text form of the fitted model's summary.

    Parameters:
        - dataset (pd.DataFrame): The input dataset containing both independent and dependent variables.
        - independent (list): List of independent variable names.
        - dependent (str): Name of the dependent variable.
        - output_folder (str): The folder where the summary will be saved. Default is 'linear_regression_summary'.

    Returns:
        None
    """
    # Design matrix: the selected predictors passed through sm.add_constant
    design_matrix = sm.add_constant(dataset[independent])

    # Ordinary least squares fit of the target on the design matrix
    fitted = sm.OLS(dataset[dependent], design_matrix).fit()

    # The model is fitted first, so a failed fit leaves no folder behind
    os.makedirs(output_folder, exist_ok=True)
    report_path = os.path.join(output_folder, 'linear_regression_summary.txt')

    with open(report_path, 'w') as report:
        # Header line, then the summary rendered as text
        report.write('statsmodels.summary()\n')
        report.write(fitted.summary().as_text())

In [17]:
def linearRegressionSklearn(dataset, independent, dependent):
    """
    Fit a scikit-learn linear regression on a train/test split and print the
    Mean Squared Error and R-squared measured on the test split.

    The 'Date' column, if listed, is left out of the features. The same
    feature list is also passed to ``linearRegressionSummary`` with
    ``output_folder='linear_regression_summary1'``, so calling this function
    writes that summary file as a side effect.

    Parameters:
        - dataset (pd.DataFrame): The input dataset containing both independent and dependent variables.
        - independent (list): List of independent variable names. The list itself is not modified.
        - dependent (str): Name of the dependent variable.

    Returns:
        None
    """
    # Build a new list instead of calling independent.remove('Date'): the
    # in-place removal changed the caller's list and raised ValueError when
    # 'Date' was not present (e.g. on a second call with the same list).
    features = [name for name in independent if name != 'Date']

    # Split the dataset into features (X) and target variable (y)
    X = dataset[features]
    y = dataset[dependent]
    linearRegressionSummary(dataset, features, dependent, output_folder='linear_regression_summary1')

    # Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Create an instance of the LinearRegression model
    model = LinearRegression()

    # Fit the model to the training split
    model.fit(X_train, y_train)

    # Make predictions on the held-out test split only
    y_pred = model.predict(X_test)

    # Calculate and print model evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")


In [18]:
# Load the pre-processed avocado data into a DataFrame
dataset = pd.read_csv('avocado_pre_processed.csv')

# Every column except the last is treated as a predictor; the last column is the target
*independent, dependent = dataset.columns

# Save a scatter plot of each predictor against the target
scatterPlot(dataset, independent, dependent)

# Interpret the strength of the relationships via the correlation matrix heatmap
correlationMatrix(dataset, independent, dependent)

# The statsmodels summary is written from inside linearRegressionSklearn below,
# so the direct call stays disabled:
# linearRegressionSummary(dataset, independent, dependent)

# Fit and evaluate the linear regression with scikit-learn
linearRegressionSklearn(dataset, independent, dependent)

Mean Squared Error: 0.07602565666937873
R-squared: 0.5332601098096452
