In [1]:
import sys
sys.path.append('..')

import os
import pandas as pd
import matplotlib.pyplot as plt
from src.data_analysis import perform_t_test, create_pivot_table

# Try to load the manipulated data, if it doesn't exist, load the cleaned data
try:
    df = pd.read_csv('../data/processed/manipulated_dataset.csv')
    print("Loaded manipulated dataset.")
except FileNotFoundError:
    try:
        df = pd.read_csv('../data/processed/cleaned_dataset.csv')
        print("Manipulated dataset not found. Loaded cleaned dataset instead.")
    except FileNotFoundError:
        print("Neither manipulated nor cleaned dataset found. Please make sure you've run the data cleaning and manipulation steps.")
        sys.exit(1)

# Display information about the dataset
print(df.info())
print("\nFirst few rows:")
print(df.head())
print("\nColumn names:")
print(df.columns)

# Identify numeric columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
print("\nNumeric columns:")
print(numeric_columns)

# Perform t-test on a numeric column (adjust as needed)
if 'streams' in df.columns and 'artist_name' in df.columns:
    t_stat, p_value = perform_t_test(df, 'artist_name', 'streams', df['artist_name'].unique()[0], df['artist_name'].unique()[1])
    print(f"\nT-test results for 'streams' between first two artists: T-statistic = {t_stat}, p-value = {p_value}")
else:
    print("\nCouldn't perform t-test: 'streams' or 'artist_name' column not found.")

# Create pivot table (adjust column names as needed)
if 'streams' in df.columns and 'artist_name' in df.columns and 'track_name' in df.columns:
    pivot_table = create_pivot_table(df, values='streams', index='artist_name', columns='track_name')
    print("\nPivot table (first 5 rows and columns):")
    print(pivot_table.iloc[:5, :5])
else:
    print("\nCouldn't create pivot table: required columns not found.")

# Create final visualizations
if 'streams' in df.columns and 'artist_name' in df.columns:
    plt.figure(figsize=(12, 6))
    df.groupby('artist_name')['streams'].mean().sort_values(ascending=False).head(10).plot(kind='bar')
    plt.title('Top 10 Artists by Average Streams')
    plt.xlabel('Artist')
    plt.ylabel('Average Streams')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Ensure the results directory exists
    os.makedirs('../results/figures', exist_ok=True)

    # Save the figure
    plt.savefig('../results/figures/top_10_artists_by_streams.png')
    plt.close()

    print("\nFinal visualization completed. Check the 'results/figures' directory for the final plot.")
else:
    print("\nCouldn't create visualization: 'streams' or 'artist_name' column not found.")

print("\nFinal analysis completed.")

Manipulated dataset not found. Loaded cleaned dataset instead.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 n

KeyError: 'artist_name'