# Spotify-YouTube adatelemzés

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='spotify_yt.csv')
df.head(n=5)

In [None]:
# Structural overview of the table: dimensions, per-column dtype / non-null
# information, and summary statistics.
print('Shape:', (len(df.index), len(df.columns)))
print('\nInfo:')
df.info()
print('\nLeíró statisztika (numerikus oszlopok):')
# Last expression of the cell, so the notebook renders the summary table.
df.describe()

In [None]:
# Number of missing values in each column.
null_counts = df.isna().sum()
null_counts

In [None]:
# Audio-feature columns whose gaps are imputed rather than dropped.
cols_fill = ['Loudness', 'Acousticness', 'Instrumentalness', 'Energy', 'Danceability']

# Step 1: fill each gap with the mean of the same artist's tracks.
# The per-artist means are computed with the built-in 'mean' aggregation
# (no Python-level lambda per group) and applied through fillna, so only the
# missing cells are written; values that are already present are never
# replaced, including on rows that do not belong to any artist group.
artist_means = df.groupby('Artist')[cols_fill].transform('mean')
df[cols_fill] = df[cols_fill].fillna(artist_means)

# Step 2: anything still missing (no usable value within the artist's tracks)
# falls back to the column-wide mean.
df[cols_fill] = df[cols_fill].fillna(df[cols_fill].mean())

# Remaining missing values per imputed column.
df[cols_fill].isnull().sum()

In [None]:
# Columns in which a missing value causes the whole row to be removed.
cols_check = ['Speechiness', 'Liveness', 'Valence', 'Tempo', 'Key', 'Duration_ms']

# Collect the affected rows first so they can be inspected before removal.
null_rows = df.loc[df[cols_check].isna().any(axis='columns')]
print('Null értékeket tartalmazó sorok száma:', null_rows.shape[0])

print(null_rows.head())

# Remove, in place, every row with a gap in any of the checked columns.
df.dropna(subset=cols_check, inplace=True)

print('\nNull értékek az eldobás után:')
df[cols_check].isna().sum()

In [None]:
print('Eredeti dtypes:')
print(df.dtypes)

# Columns stored as integers. Missing entries are replaced by the sentinel -1
# before the cast, because a NumPy integer column cannot hold NaN.
int_cols = ['Duration_ms', 'Key', 'Views', 'Likes', 'Comments', 'Stream']
for col in int_cols:
    if col in df.columns:
        # Explicit 64-bit width: plain `int` resolves to the platform default
        # integer, which is 32-bit on Windows with NumPy < 2.
        # NOTE(review): stream/view counts can presumably exceed 2**31 - 1.
        df[col] = df[col].fillna(-1).astype('int64')

# Flag columns: a missing flag is treated as False.
bool_cols = ['Licensed', 'official_video']
for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].fillna(False).astype(bool)
print('\nÁtalakítás után dtypes:')
print(df.dtypes)

# Preview only the converted columns that actually exist, consistent with the
# membership checks in the loops above (avoids a KeyError on absent columns).
converted_cols = [col for col in int_cols + bool_cols if col in df.columns]
df[converted_cols].head()

In [None]:
# Remove rows whose stream count equals the -1 placeholder value.
if 'Stream' in df.columns:
    before = len(df)
    has_stream = df['Stream'].ne(-1)
    df = df.loc[has_stream].copy()
    after = len(df)
    print(f'Dropped {before-after} rows with Stream == -1')

# Discard the two Spotify link/identifier columns; columns that are not
# present are skipped silently.
df.drop(columns=['Url_spotify', 'Uri'], errors='ignore', inplace=True)
print('Columns now:', list(df.columns))

In [None]:
# Write the cleaned table to disk without the index column, then show a
# three-row preview.
out_path = 'spotify_yt_cleaned.csv'
df.to_csv(path_or_buf=out_path, index=False)
print('Saved cleaned data to', out_path)

df.head(n=3)

In [None]:
# Count rows that have a YouTube link. When the link column is absent, the
# number of non-null 'Title' values is used instead.
# NOTE(review): presumably 'Title' is the YouTube video title - confirm.
count_col = 'Url_youtube' if 'Url_youtube' in df.columns else 'Title'
yt_count = df[count_col].notna().sum()
print('Tracks with YouTube video:', yt_count)
yt_count

In [None]:
# Mean danceability per artist, ranked from highest to lowest; keep the
# first ten entries.
mean_dance_by_artist = df.groupby('Artist')['Danceability'].mean()
top10_dance = mean_dance_by_artist.sort_values(ascending=False).iloc[:10]
print('Top 10 artists by average Danceability:')
top10_dance

In [None]:
# Per-artist totals over the whole table.
streams_by_artist = df.groupby('Artist')['Stream'].sum()
# The integer conversion earlier in this file stores missing counts as -1;
# clip at 0 so those placeholders do not subtract from an artist's total.
views_by_artist = df['Views'].clip(lower=0).groupby(df['Artist']).sum()

top10_streamed = streams_by_artist.sort_values(ascending=False).head(10)
top10_views = views_by_artist.sort_values(ascending=False).head(10)
print('Top10 streamed (artists):')
print(top10_streamed)
print('\nTop10 by YouTube views:')
print(top10_views)

# Comparison table for the ten most-streamed artists. Views are looked up in
# the full per-artist totals, not in top10_views, so an artist outside the
# YouTube top ten still shows its real view count instead of 0.
comp = pd.DataFrame({'Stream': top10_streamed})
comp['Views'] = views_by_artist.reindex(comp.index)

fig, ax1 = plt.subplots(figsize=(12, 6))
x = np.arange(len(comp.index))
width = 0.4

# Left y-axis: Spotify streams.
stream_bars = ax1.bar(x - width/2, comp['Stream'], width, label='Spotify Streams', color='blue')
ax1.set_xlabel('Artist')
ax1.set_ylabel('Streams (összeg)', color='blue')
ax1.set_xticks(x)
ax1.set_xticklabels(comp.index, rotation=45)

# Right y-axis: YouTube views, on their own scale.
ax2 = ax1.twinx()
view_bars = ax2.bar(x + width/2, comp['Views'], width, label='YouTube Views', color='orange', alpha=0.6)
ax2.set_ylabel('YouTube Views (összeg)', color='orange')
ax1.set_title('Top10 Spotify streamed artists (Streams vs YouTube Views)')
# One legend covering the bars of both axes; a plain legend call after
# twinx() would only pick up the artists of the secondary axes. It is drawn
# on ax2 so the secondary bars cannot cover it.
ax2.legend(handles=[stream_bars, view_bars], loc='upper left')
fig.tight_layout()
plt.show()
comp

In [None]:
# Rank all rows by stream count (highest first), keep the first row seen for
# each track title, and take the ten leading rows.
ranked_by_stream = df.sort_values('Stream', ascending=False)
top10_tracks = ranked_by_stream.drop_duplicates(subset=['Track']).head(10)

# Pairwise correlations between the numeric columns of those ten rows.
num_cols = list(top10_tracks.select_dtypes(include='number').columns)
corr = top10_tracks[num_cols].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix for top 10 tracks (by Streams)')
fig.tight_layout()
plt.show()
corr

In [None]:
artists_of_interest = ['J. Cole', 'Kendrick Lamar', 'Drake']

# Rows belonging to the selected artists, as an independent copy.
is_selected = df['Artist'].isin(artists_of_interest)
subset = df.loc[is_selected].copy()

# Keep a single row per track title.
subset = subset.drop_duplicates(subset=['Track'])

# Total streams and views per artist, ordered as in artists_of_interest;
# artists without any row get 0.
agg = subset.groupby('Artist')[['Stream', 'Views']].sum()
agg = agg.reindex(artists_of_interest).fillna(0)
print(agg)

# Two pies side by side: share of streams (left) and share of views (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
colors = ['#66c2a5', '#fc8d62', '#8da0cb']

panels = (
    ('Stream', 'Spotify Streams (összeg)'),
    ('Views', 'YouTube Views (összeg)'),
)
for ax, (column, title) in zip(axes, panels):
    ax.pie(agg[column], labels=agg.index, autopct='%1.1f%%', colors=colors, startangle=140)
    ax.set_title(title)

fig.patch.set_facecolor('#f7f7f7')
fig.tight_layout()
plt.show()
agg