# YouTube streamer analysis
### Top YouTube content creators

#### Description
This dataset contains valuable information about the top YouTube streamers, including their ranking, categories, subscribers, country, visits, likes, comments, and more. The task is to perform a comprehensive analysis of the dataset to extract insights about the top YouTube content creators.

### Load libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Load the dataset

In [None]:
# Read the streamer data into a DataFrame (the path is relative to the working directory)
df = pd.read_csv('youtubers_df.csv')

### 1. Exploring the dataset

In [None]:
# Preview the first 5 rows of the dataset (head() defaults to n=5)
df.head()

In [None]:
# Concise summary of the dataframe: columns, non-null counts and dtypes
df.info()

In [None]:
# Check how many rows and columns the dataset contains, as a (rows, columns) tuple
df.shape

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

##### Identifying key variables

In [None]:
# List every distinct content category, one per line
distinct_categories = df['Categories'].unique()
for label in distinct_categories:
    print(label)

In [None]:
# List every distinct country, one per line
distinct_countries = df['Country'].unique()
for label in distinct_countries:
    print(label)

##### Checking for missing values

In [None]:
# Count the missing values in each column
df.isnull().sum()

##### Data cleaning

In [None]:
# Fix the misspelled 'Suscribers' column name by renaming it to 'Subscribers'
# (inplace=True modifies df itself rather than returning a new frame)
df.rename(columns={'Suscribers': 'Subscribers'}, inplace=True)

##### Visualizing the visits and subscribers columns and checking for any outliers

In [None]:
# Scatter plot of 'Subscribers' (x axis) against 'Visits' (y axis)
fig, ax = plt.subplots()
ax.scatter(df['Subscribers'], df['Visits'])
ax.set_xlabel('Subscribers')
ax.set_ylabel('Visits')
ax.set_title('Subscribers vs Visits')
plt.show()

In [None]:
# Box plot showing the spread of the 'Subscribers' column
fig, ax = plt.subplots()
ax.boxplot(df['Subscribers'])
ax.set_ylabel('Subscribers')
ax.set_title('Box Plot for Subscribers')
plt.show()

In [None]:
# Box plot showing the spread of the 'Visits' column
fig, ax = plt.subplots()
ax.boxplot(df['Visits'])
ax.set_ylabel('Visits')
ax.set_title('Box Plot for Visits')
plt.show()

In [None]:
# Box plot of 'Subscribers' drawn on an explicitly sized 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(df['Subscribers'])
ax.set_ylabel('Subscribers')
ax.set_title('Box Plot of Subscribers')
plt.show()

In [None]:
# Explore outliers using the interquartile range (IQR) rule
print('Using the IQR to find outliers')
def find_outliers_IQR(df):
    """Return the values of *df* that lie outside the 1.5 * IQR fences.

    A value is an outlier when it is below ``q1 - 1.5 * IQR`` or above
    ``q3 + 1.5 * IQR``, where ``q1``/``q3`` are the 25th/75th percentiles
    (computed per column for a DataFrame).

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data to inspect.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        For a Series, only the outlying values. For a DataFrame, the rows
        containing at least one outlier; cells that are not outliers are NaN.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    outliers = df[(df < lower) | (df > upper)]
    if isinstance(outliers, pd.DataFrame):
        # Boolean-frame indexing keeps every row and only masks values to NaN;
        # drop the all-NaN rows so len() reflects rows that really hold outliers.
        outliers = outliers.dropna(how='all')
    return outliers


outliers = find_outliers_IQR(df[["Subscribers", "Visits", "Likes", "Comments"]])
# Cells that are not outliers are NaN in the result, so count() (which skips
# NaN) gives the number of outliers per column; len(outliers) would only count
# rows of the returned frame, not outlying values.
print("Number of Outliers: " + str(outliers.count()))
print("Max Outlier value: " + str(outliers.max()))
print("Min Outlier value: " + str(outliers.min()))
print(outliers)

def impute_outliers_IQR(df):
    """Replace values outside the 1.5 * IQR fences with the median.

    Values above ``q3 + 1.5 * IQR`` or below ``q1 - 1.5 * IQR`` are swapped
    for ``df.median()``; every other value is kept. The result comes from
    ``np.where``, so it is a NumPy array rather than a pandas object.
    """
    first_quartile = df.quantile(0.25)
    third_quartile = df.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    upper_fence = third_quartile + fence_width
    lower_fence = first_quartile - fence_width
    is_outlier = (df > upper_fence) | (df < lower_fence)
    return np.where(is_outlier, df.median(), df)

### 2. Trend analysis

In [None]:
# Print a heading, then count how many streamers fall into each category
print('Trends among the top youtube streamers')
df["Categories"].value_counts()

In [None]:
# Order the streamers from most to fewest subscribers and preview the first rows
top_streamers = df.sort_values(by='Subscribers', ascending=False)
top_streamers.head()

In [None]:
# Trend analysis: number of streamers per category, ordered by frequency
category_order = df['Categories'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Categories', data=df, order=category_order, ax=ax)
plt.xticks(rotation=45, ha='right')
ax.set_title('Distribution of top Streamers by Category')
plt.show()

In [None]:
# Performance metrics: mean of each metric over all rows of top_streamers,
# drawn as a bar chart
metric_columns = ['Subscribers', 'Visits', 'Likes', 'Comments']
average_metrics = top_streamers[metric_columns].mean()
ax = average_metrics.plot.bar(figsize=(10, 6), rot=0)
ax.set_title('Top Streamers Average Performance Metrics')
ax.set_xlabel('Metrics')
ax.set_ylabel('Mean_Value')
plt.show()

In [None]:
# Correlation analysis: scatter of Subscribers (x) against Likes (y)
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='Subscribers', y='Likes', data=df, ax=ax)
ax.set_title('Correlation between Subscribers and Likes')
ax.set_xlabel('Subscribers')
ax.set_ylabel('Likes')
plt.show()

In [None]:
# Correlation analysis: scatter of Subscribers (x) against Comments (y)
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='Subscribers', y='Comments', data=df, ax=ax)
ax.set_title('Correlation between Subscribers and Comments')
ax.set_xlabel('Subscribers')
ax.set_ylabel('Comments')
plt.show()

In [None]:
# Correlation between the number of subscribers and the number of likes and comments
for engagement in ('Likes', 'Comments'):
    correl = df['Subscribers'].corr(df[engagement])
    print(f"Correlation between Number of Subscribers and {engagement}:\n", correl)

# Pairwise correlation matrix of the four numeric metrics
metric_columns = ['Subscribers', 'Visits', 'Likes', 'Comments']
correlation_matrix = df[metric_columns].corr()

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='YlGnBu', fmt='.2f', ax=ax)
ax.set_title('Correlation matrix')
plt.show()

### 3. Audience study

In [None]:
# Count the occurrences of each country (not used again in this cell)
country_counts = df['Country'].value_counts()

# Group by Country and calculate the mean number of subscribers
country_stats = df.groupby('Country')['Subscribers'].mean()

# Display the mean subscribers per country
print(country_stats)

In [None]:
# Distribution of Audiences by Country visualization (countries ordered by frequency)
country_order = df['Country'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Country', data=df, order=country_order, ax=ax)
plt.xticks(rotation=45, ha='right')
ax.set_title('Distribution of Audiences by Country')
plt.show()

# Mean subscribers for each (country, category) pair, pivoted so that
# categories become the columns
subscribers_by_pair = df.groupby(['Country', 'Categories'])['Subscribers'].mean()
region_category_stats = subscribers_by_pair.unstack()
print(region_category_stats)

# Regional Preferences for Content Categories
# normalize='index' scales each category row so it sums to 1 across countries
category_country_share = pd.crosstab(df['Categories'], df['Country'], normalize='index')
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(category_country_share, cmap='viridis', annot=True, fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Regional Preferences for Content Categories')
ax.set_xlabel('Country')
ax.set_ylabel('Content Category')
plt.show()

### 4. Performance metrics

In [None]:
# Calculate the mean of each performance metric across all rows of df
average_metrics = df[['Subscribers', 'Visits', 'Likes', 'Comments']].mean()

In [None]:
# Display the average of each metric, one line per metric
for metric_name in ('Subscribers', 'Visits', 'Likes', 'Comments'):
    print(f"Average {metric_name}:", average_metrics[metric_name])

In [None]:
# Bar chart of the average value of each performance metric
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=average_metrics.index, y=average_metrics.values, palette='Set2', ax=ax)
ax.set_title('Average Performance Metrics')
ax.set_ylabel('Average Count')
plt.show()

### 5. Content categories

In [None]:
# Explore the distribution of content categories: number of rows per category
category_distribution = df['Categories'].value_counts()

In [None]:
# Print the number of streamers per category under a heading
print("Number of streamers per category:")
print(category_distribution)

In [None]:
# Visualize the distribution of content categories, ordered by frequency
category_order = df['Categories'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Categories', data=df, order=category_order, palette='viridis', ax=ax)
ax.set_title('Distribution of Content Categories')
ax.set_xlabel('Categories')
ax.set_ylabel('Number of Streamers')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Explore performance metrics across content categories:
# one bar chart per metric, laid out on a 2x2 grid
performance_metrics = ['Subscribers', 'Visits', 'Likes', 'Comments']

plt.figure(figsize=(16, 8))

# enumerate(start=1) yields the 1-based subplot position directly, instead of
# searching the list with .index() on every iteration (which would also pick
# the wrong slot if a metric name were ever repeated).
for position, metric in enumerate(performance_metrics, start=1):
    plt.subplot(2, 2, position)
    sns.barplot(x='Categories', y=metric, data=df, ci=None, palette='viridis')
    plt.title(f'Average {metric} by Category')
    plt.xlabel('Categories')
    plt.ylabel(f'Average {metric}')
    plt.xticks(rotation=45, ha='right')

plt.tight_layout(pad=2)
plt.show()

In [None]:
# Absolute cut-off applied to the category average of every metric.
# NOTE(review): the same value is used for all four metrics; confirm that it
# is meaningful on each metric's scale.
threshold_value = 20000

for metric in performance_metrics:
    # Mean of the metric within each category
    avg_metric_per_category = df.groupby('Categories')[metric].mean()

    # Keep only the categories whose average exceeds the cut-off
    above_threshold = avg_metric_per_category.gt(threshold_value)
    exceptional_categories = avg_metric_per_category[above_threshold]

    # Report them from highest to lowest average
    print(f"\nCategories with exceptional {metric} performance:")
    print(exceptional_categories.sort_values(ascending=False))

### 6. Brands and Collaborations

In [None]:
# 6 - Brands and Collaborations
# Work on an explicit copy so the column assignments below modify an
# independent DataFrame (assigning into the result of dropna() can raise
# pandas' SettingWithCopyWarning).
df_cleaned = df.copy()
print(df_cleaned[['Visits', 'Subscribers']].dtypes)

# Coerce both columns to numeric; values that cannot be parsed become NaN
df_cleaned['Visits'] = pd.to_numeric(df_cleaned['Visits'], errors='coerce')
df_cleaned['Subscribers'] = pd.to_numeric(df_cleaned['Subscribers'], errors='coerce')

# Drop missing rows only after coercion, so NaNs introduced by
# errors='coerce' are removed together with the originally missing values
df_cleaned = df_cleaned.dropna(subset=['Visits', 'Subscribers'])

In [None]:
# Scatter plot of visits (x axis) against subscribers (y axis).
# The plotted y values are df_cleaned['Subscribers'], so the title and axis
# label name that column rather than "Brand Collaborations".
plt.figure(figsize=(12, 8))
plt.scatter(df_cleaned['Visits'], df_cleaned['Subscribers'], c='darkblue', alpha=0.5)
plt.title('Visits vs. Subscribers')
plt.xlabel('Visits')
plt.ylabel('Subscribers')
plt.show()

In [None]:
# Calculate the correlation coefficient between visits and subscribers.
# NOTE: despite its name, correlation_visits_links holds the
# Visits/Subscribers correlation; the message now says what is computed.
correlation_visits_links = df_cleaned['Visits'].corr(df_cleaned['Subscribers'])
print(f"Correlation between Visits and Subscribers: {correlation_visits_links}")

### 7. Benchmarking 

In [None]:
# Average value of each performance metric, unpacked in column order
benchmark_columns = ['Subscribers', 'Visits', 'Likes', 'Comments']
(average_subscribers,
 average_visits,
 average_likes,
 average_comments) = (df[column].mean() for column in benchmark_columns)

In [None]:
# Top-performing content creators: rows strictly above the average on all
# four metrics at once
above_average = (
    df['Subscribers'].gt(average_subscribers)
    & df['Visits'].gt(average_visits)
    & df['Likes'].gt(average_likes)
    & df['Comments'].gt(average_comments)
)
top_performers = df[above_average]

In [None]:
# Print the identifying and performance columns of the top-performing content creators
print("Top-performing content creators:")
print(top_performers[['Rank', 'Username', 'Categories', 'Subscribers', 'Visits', 'Likes', 'Comments', 'Links']])