In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Read the chart dataset into the raw frame that every later cell starts from.
# The path is relative, so it is resolved against the kernel's working directory.
csv_path = 'bbTop100.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the first few rows of the raw frame as a quick sanity check on the load
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame
df.info()

In [None]:
# Work on a copy so the raw frame loaded from the CSV is left untouched
music_df = df.copy()

# NOTE(review): the chart positions assigned below are drawn at random and are
# independent of the audio features, so any model fitted to 'chart_position'
# later on is fitting noise. Confirm whether real chart ranks are available.

# Positions are drawn from 1..CHART_SIZE without replacement within each year.
CHART_SIZE = 100
# Fixed seed: without it every "Restart & Run All" produced different
# positions, and therefore different errors and feature importances downstream.
POSITION_SEED = 42
position_rng = np.random.default_rng(POSITION_SEED)

# Start with an empty (NaN) column; it is filled year by year below
music_df['chart_position'] = np.nan

for year in music_df['year'].unique():
    # Boolean mask rather than index labels, so the assignment touches exactly
    # this year's rows even if the frame's index contains duplicate labels
    year_mask = music_df['year'] == year
    n_songs = int(year_mask.sum())

    # Unique positions for this year; raises ValueError if a year holds more
    # than CHART_SIZE songs, because the draw is without replacement
    positions = position_rng.choice(
        np.arange(1, CHART_SIZE + 1), size=n_songs, replace=False
    )
    music_df.loc[year_mask, 'chart_position'] = positions

# Every row has a value now, so the float placeholder column can become int
music_df['chart_position'] = music_df['chart_position'].astype(int)


In [None]:
# Preview the working copy to confirm the new 'chart_position' column is present
music_df.head()

In [None]:
# By Year
#
# Fit one random forest per chart year on an 80/20 split, report the held-out
# mean squared error, and keep each year's feature importances.

# Model inputs and the column being predicted
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]
target = 'chart_position'

# Maps year -> importance array, ordered like `features`
feature_importances_by_year = {}

for year in music_df['year'].unique():
    # Keep only the rows for the current year
    year_data = music_df.loc[music_df['year'] == year]

    # Inputs and target for this year
    X, y = year_data[features], year_data[target]

    # Hold out 20% of the year's songs; fixed seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the forest on the training portion only
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Score on the held-out songs
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Year: {year}, Mean Squared Error: {mse}")

    # Remember what this year's model relied on
    feature_importances_by_year[year] = model.feature_importances_

# Print every year's importances, one feature per line
for year, importances in feature_importances_by_year.items():
    print(f"Year: {year}")
    for feature, importance in zip(features, importances):
        print(f"{feature}: {importance}")


In [None]:
# By Decades
#
# Pool the songs into ten-year buckets, fit one random forest per bucket on an
# 80/20 split, report the held-out mean squared error, and keep the importances.

# Model inputs and the column being predicted
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]
target = 'chart_position'

# Floor each year to the start of its decade (e.g. 1987 -> 1980)
music_df['decade'] = (music_df['year'] // 10) * 10

# Maps decade -> importance array, ordered like `features`
feature_importances_by_decade = {}

for decade in music_df['decade'].unique():
    # Keep only the rows that fall in the current decade
    decade_data = music_df.loc[music_df['decade'] == decade]

    # Inputs and target for this decade
    X, y = decade_data[features], decade_data[target]

    # Training and testing sets; fixed seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the forest on the training portion only
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Score on the held-out songs
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Decade: {decade}s, Mean Squared Error: {mse}")

    # Remember what this decade's model relied on
    feature_importances_by_decade[decade] = model.feature_importances_

# Print every decade's importances, one feature per line
for decade, importances in feature_importances_by_decade.items():
    print(f"Decade: {decade}s")
    for feature, importance in zip(features, importances):
        print(f"{feature}: {importance}")


In [None]:

# Overall model: a single random forest fitted across all years at once.

# Model inputs and the column being predicted
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]
target = 'chart_position'

# Inputs and target taken from the full frame
X = music_df[features]
y = music_df[target]

# Hold out 20% of all songs; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the forest on the training portion only
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score on the held-out songs
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Overall, Mean Squared Error: {mse}")

# Tabulate the importances, most influential feature first
importances = model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importance_df)


In [None]:
# Overall
# Display the feature-importance table with the notebook's rich DataFrame view.
# `feature_importance_df` is whatever the most recently executed cell assigned;
# when the notebook is run top to bottom that is the overall model's table.
feature_importance_df

In [None]:
# By Decade
#
# Fit one random forest per decade on ALL of that decade's songs (no hold-out)
# and keep a sorted importance table per decade for printing and plotting.

# Model inputs and the column being predicted
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
target = 'chart_position'

# Floor each year to the start of its decade (e.g. 1987 -> 1980)
music_df['decade'] = music_df['year'] // 10 * 10

# Maps decade -> DataFrame of (Feature, Importance), sorted by importance
feature_importances_by_decade = {}

for decade in music_df['decade'].unique():
    # Keep only the rows that fall in the current decade
    decade_data = music_df[music_df['decade'] == decade]

    # Inputs and target for this decade
    X = decade_data[features]
    y = decade_data[target]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    # Decade-scoped names on purpose: the bare names `importances` and
    # `feature_importance_df` hold the overall model's results, and rebinding
    # them here made the later "Overall" plot show the last decade instead.
    decade_importances = model.feature_importances_

    decade_importance_df = pd.DataFrame(
        {'Feature': features, 'Importance': decade_importances}
    ).sort_values(by='Importance', ascending=False)

    feature_importances_by_decade[decade] = decade_importance_df

# Print each decade's table; `importance_df` stays bound to the last one afterwards
for decade, importance_df in feature_importances_by_decade.items():
    print(f"Decade: {decade}s")
    print(importance_df)


In [None]:
# `importance_df` is the loop variable left over from the final iteration of the
# preceding cell's printing loop, so this shows only the last decade's table.
print(importance_df)

In [None]:

# Draw one bar chart per decade from the tables in feature_importances_by_decade
for decade, importance_df in feature_importances_by_decade.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(importance_df['Feature'], importance_df['Importance'])
    ax.set_title(f"Feature Importances for {decade}s")
    ax.set_xlabel("Feature")
    ax.set_ylabel("Importance")
    # Slant the feature names so the long ones do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()


In [None]:

# Overall feature importances, plotted.
#
# The bare name `importances` is a shared global that the per-year and
# per-decade loops in earlier cells rebind on every iteration, so reading it
# here would plot whichever array was assigned last rather than the overall
# model's. The overall importances are therefore recomputed in this cell with
# the same 80/20 split and seeds as the overall-model cell, which makes the
# result independent of what ran in between.
overall_split = train_test_split(
    music_df[features], music_df[target], test_size=0.2, random_state=42
)
# train_test_split returns (X_train, X_test, y_train, y_test)
overall_X_train, overall_y_train = overall_split[0], overall_split[2]

overall_model = RandomForestRegressor(n_estimators=100, random_state=42)
overall_model.fit(overall_X_train, overall_y_train)

# Sort overall feature importances by importance value, largest first
overall_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': overall_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(10, 6))
plt.bar(overall_importance_df['Feature'], overall_importance_df['Importance'], color='red')
plt.title("Overall Feature Importances")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
