In [2]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np

# Load the cleaned data
df = pd.read_csv('../data/processed/cleaned_dataset.csv')

# Display information about the dataset.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called bare; wrapping it in print() would append a stray "None" line.
df.info()
print("\nFirst few rows:")
print(df.head())
print("\nColumn names:")
print(df.columns)

# Check the data type of the 'streams' column
if 'streams' in df.columns:
    print("\nData type of 'streams' column:")
    print(df['streams'].dtype)

    # Convert 'streams' to numeric; values that cannot be parsed become NaN.
    streams_numeric = pd.to_numeric(df['streams'], errors='coerce')

    # errors='coerce' discards bad values without any notice, so report how
    # many entries were present in the raw column but could not be parsed.
    coerced_count = int((streams_numeric.isna() & df['streams'].notna()).sum())
    if coerced_count:
        print(f"\n{coerced_count} non-numeric 'streams' value(s) were replaced with NaN.")

    df['streams'] = streams_numeric

    # Filter songs with more than 1 million streams.
    # NaN never satisfies the comparison, so rows with missing 'streams' are
    # already excluded and no separate dropna() pass is needed.
    filtered_df = df[df['streams'] > 1000000]
    print("\nFiltered data (first few rows):")
    print(filtered_df.head())
else:
    print("\n'streams' column not found in the dataset.")

# Group by and aggregate
# The grouping key is the first object-dtype column; every int64/float64
# column is summarised with mean, sum and max.
string_columns = df.select_dtypes(include=['object']).columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

if string_columns.empty or numeric_columns.empty:
    # Grouping needs at least one key column and one column to aggregate.
    print("\nUnable to perform grouping. No suitable columns found.")
else:
    group_column = string_columns[0]
    summary_stats = ['mean', 'sum', 'max']
    agg_dict = {col: list(summary_stats) for col in numeric_columns}

    grouped_df = (
        df.groupby(group_column)
        .agg(agg_dict)
        .reset_index()
    )
    print(f"\nGrouped data by '{group_column}' (first few rows):")
    print(grouped_df.head())

# Create a new column: Spotify playlist appearances per stream.
if 'in_spotify_playlists' in df.columns and 'streams' in df.columns:
    # A ratio with a zero denominator is undefined; mapping 0 to NaN keeps
    # +/-inf out of the new column (and out of the CSV written afterwards).
    stream_counts = df['streams'].replace(0, np.nan)
    df['playlist_stream_ratio'] = df['in_spotify_playlists'] / stream_counts

    # Choose the label column for the preview here rather than reusing
    # `group_column`: that name is only assigned when the grouping step
    # found suitable columns, so relying on it can raise NameError.
    preview_columns = ['streams', 'in_spotify_playlists', 'playlist_stream_ratio']
    label_columns = df.select_dtypes(include=['object']).columns
    if len(label_columns) > 0:
        preview_columns.insert(0, label_columns[0])

    print("\nNew column 'playlist_stream_ratio' (first few rows):")
    print(df[preview_columns].head())
else:
    print("\nUnable to create 'playlist_stream_ratio'. Required columns not found.")

# Persist the transformed frame next to the cleaned input; the positional
# index is left out of the file.
df.to_csv(
    path_or_buf='../data/processed/manipulated_dataset.csv',
    index=False,
)
print("\nData manipulation completed. Manipulated data saved to '../data/processed/manipulated_dataset.csv'")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

KeyError: 'artist_name'