In [None]:
# IMPORTS AND DEFINITIONS ONLY

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Maps each dataset column acronym to the human-readable label used in printed
# output, axis labels and plot titles. Every label repeats its acronym in parentheses.
acronym_to_name = {
    "bpm": "Beats per Minute (bpm)",
    "nrgy": "Energy (nrgy)",  # label previously misspelled the acronym as "ngry"
    "dnce": "Danceability (dnce)",
    "val": "Mood (val)",
    "pop": "Popularity (pop)"
}


def analyze_feature_individually(feature):
    """Print the display name, mean and variance of a single feature.

    Args:
        feature: object exposing .mean(), .var() and .name (the callers in this
            notebook pass DataFrame columns). The name is looked up in
            acronym_to_name; an unknown name prints as None.

    Returns:
        None. The three-line summary, followed by a blank line, goes to stdout.
    """
    mean_value, variance_value = feature.mean(), feature.var()
    display_name = acronym_to_name.get(feature.name)

    # One print call; sep puts each entry on its own line and the final
    # entry keeps its trailing newline so a blank line follows the summary.
    print(
        f"Analyzed Feature: {display_name}",
        f"Mean Value: {mean_value:.2f}",
        f"Variance: {variance_value:.2f}\n",
        sep="\n",
    )

def analyze_features_jointly(feature_one, feature_two):
    """Print the covariance and Pearson correlation of two features.

    Args:
        feature_one: object exposing .cov(other) and .corr(other, method=...).
        feature_two: the feature compared against feature_one.

    Returns:
        None. Both statistics are printed with two decimals, covariance first.
    """
    cov_value = feature_one.cov(feature_two)
    corr_value = feature_one.corr(feature_two, method='pearson')

    print(
        f"Covariance: {cov_value:.2f}",
        f"Pearson's Correlation: {corr_value:.2f}",
        sep="\n",
    )
    
def create_figure(feature_one, feature_two):
    """Scatter-plot feature_one (x-axis) against feature_two (y-axis).

    Each point is colored by |feature_one| + |feature_two| using the 'plasma'
    colormap. Axis labels and the title are the display names found in
    acronym_to_name for each feature's .name.

    Args:
        feature_one: values for the x-axis; must expose .name.
        feature_two: values for the y-axis; must expose .name.

    Returns:
        The newly created matplotlib figure containing a single axes.
    """
    label_one = acronym_to_name.get(feature_one.name)
    label_two = acronym_to_name.get(feature_two.name)

    figure = plt.figure()
    ax = figure.add_subplot(111)

    # Color from the function's own arguments. This used to read a global named
    # `feature`, which only worked when the caller's loop variable happened to
    # hold the same data as feature_one.
    color = np.abs(feature_one) + np.abs(feature_two)
    ax.scatter(feature_one, feature_two, c=color, cmap='plasma')
    ax.set_xlabel(label_one)
    ax.set_ylabel(label_two)
    ax.set_title(f"{label_one} x {label_two}")

    return figure
    

def create_figure_with_regression_line(feature_one, feature_two, w0_star, w1_star):
    """Scatter-plot two features and overlay the line y = w0_star + w1_star * x.

    Args:
        feature_one: values for the x-axis (the regression input).
        feature_two: values for the y-axis (the regression target).
        w0_star: intercept of the regression line.
        w1_star: slope of the regression line.

    Returns:
        The figure produced by create_figure, with a red line labelled
        'Regression Line' and a legend added to its first axes.
    """
    # Plot the arguments that were passed in. The body previously ignored them
    # and read the globals `feature` and `pop` instead.
    figure = create_figure(feature_one, feature_two)
    ax = figure.get_axes()[0]
    ax.plot(
        feature_one,
        w0_star + w1_star * feature_one,
        color='red',
        label='Regression Line',
    )
    ax.legend()

    return figure

    
# Load the data
# The CSV location can be overridden with the SPOTIFY_DATA_PATH environment
# variable so the notebook is not tied to one machine's directory layout; when
# the variable is unset, the original local path is used unchanged.
PATH_TO_DATA = os.environ.get(
    "SPOTIFY_DATA_PATH",
    "/Users/germaingirndt/source/machine_learning/prog_exercise_01/task_01/kaggle/Spotify 2010 - 2019 Top 100.csv",
)
data = pd.read_csv(PATH_TO_DATA)












In [None]:
# Shared variables for the cells below
feature_names = ["pop", "top genre", "bpm", "nrgy", "dnce", "val"]

# One column per entry of feature_names, in the same order
features = list(map(data.get, feature_names))

# First column is the popularity target, second the genre,
# the remainder are the features examined in the analysis cells
pop, genre, *other_features = features




In [None]:
# Overview of the data

# Keep only the columns of interest and drop every row with a missing value
df_cleaned = data[feature_names].dropna()

df_cleaned.describe(include="all").round(2)

# Note: the NaNs in this summary are not missing data.
# They appear because numeric statistics are not defined for non-numeric columns, and vice versa.

In [None]:
# 1.1a - Scatter plot and Pearson's correlation coefficient for each feature against popularity
for feature in other_features:
    # Draw and show the scatter plot first, then print the statistics below it
    create_figure(feature, pop)
    plt.show()

    for series in (pop, feature):
        analyze_feature_individually(series)
    analyze_features_jointly(pop, feature)

    # Blank line plus a ruler between features
    print()
    print("-" * 100)

In [None]:
# 1.1b - Ordinary Least Squares:
# Determine the optimal coefficients, add the regression line to the plot,
# and compute the mean squared error (MSE)

for feature in other_features:
    # Regression input (X) and target (Y) for this iteration
    X, Y = feature, pop

    # Keep only the rows where both X and Y are present
    df = pd.DataFrame({Y.name: Y, X.name: X}).dropna()
    filtered_X = df[X.name]
    filtered_Y = df[Y.name]

    # model.fit() needs a 2-D input, so the feature becomes a single column
    treated_X = filtered_X.values.reshape(-1, 1)
    # Cap the target at the length of the input
    treated_Y = filtered_Y[:len(filtered_X)]

    # Fit the linear model and read off intercept and slope
    model = LinearRegression().fit(treated_X, treated_Y)
    w0_star, w1_star = model.intercept_, model.coef_[0]

    # Mean squared error of the fitted model on the same data
    squared_errors = (model.predict(treated_X) - treated_Y) ** 2
    mse = sum(squared_errors) / len(treated_Y)

    create_figure_with_regression_line(feature, pop, w0_star, w1_star)
    plt.show()

    print(
        f"Feature: {acronym_to_name.get(X.name)}",
        f"w0*: {w0_star}",
        f"w1*: {w1_star}",
        f"Mean Squared Error: {mse}",
        sep="\n",
    )
    print("\n" + "-"*50)





In [None]:
# Draft 1

import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Synthetic data: fixed seed so every run draws the same points
np.random.seed(42)
num_points = 100

# Each point holds one standard-normal draw per axis, drawn in x, y, z order
points = [
    {axis: np.random.normal(0, 1) for axis in ('x', 'y', 'z')}
    for _ in range(num_points)
]

# 3D scatter plot, one red "x" marker per point
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for point in points:
    ax.scatter(point["x"], point["y"], point["z"], color="red", marker="x")

# Axis labels
ax.set(xlabel='X', ylabel='Y')
ax.set_zlabel('Z')




In [None]:
# Draft 2


# Target (Y) and input (X) for the hand-written regression
Y = pop
X = other_features[1]

# Keep only the rows where both X and Y are present
df = pd.DataFrame({Y.name: Y, X.name: X}).dropna()
filtered_Y, filtered_X = df[Y.name], df[X.name]

# The input is used as-is; the target is capped at the input's length
treated_X = filtered_X
treated_Y = filtered_Y[:len(filtered_X)]


def calculate_w1_star(X, Y):
    """Return the least-squares slope of Y on X from the closed-form sums.

    Computes (n * sum(x*y) - sum(x) * sum(y)) / (n * sum(x^2) - sum(x)^2).

    Args:
        X: input values; must support len(), iteration and .sum().
        Y: target values; must support iteration and .sum().

    Returns:
        The slope w1*.
    """
    count = len(X)
    total_x, total_y = X.sum(), Y.sum()
    total_xy = sum(x_i * y_i for x_i, y_i in zip(X, Y))
    total_xx = sum(x_i ** 2 for x_i in X)

    numerator = (count * total_xy) - total_x * total_y
    denominator = count * total_xx - total_x ** 2
    return numerator / denominator
    
def calculate_w0_star(X, Y, w1_star):
    """Return the intercept w0* = mean(Y) - w1_star * mean(X).

    Args:
        X: input values; must expose .mean().
        Y: target values; must expose .mean().
        w1_star: slope of the regression line.
    """
    return Y.mean() - w1_star * X.mean()
    
def predict_y(x):
    """Predict the target for input x with a line fitted to treated_X / treated_Y.

    The slope and intercept are recomputed from the module-level treated_X and
    treated_Y on every call and printed before the prediction is returned.

    Args:
        x: input value (or values) to evaluate the fitted line at.

    Returns:
        w0* + w1* * x.
    """
    slope = calculate_w1_star(treated_X, treated_Y)
    intercept = calculate_w0_star(treated_X, treated_Y, slope)

    print(f"w0*: {intercept}", f"w1*: {slope}", sep="\n")

    return intercept + slope * x



# Example: evaluate the fitted line at x = 105 for the feature chosen as X above.
# predict_y prints w0* and w1*; the returned prediction is the cell's last expression.
predict_y(105)
