# Clash of Clans - Exploratory Data Analysis

Let's start by importing the libraries and see where the data will take us! First, explore the data.

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go 
import plotly.figure_factory as ff


In [None]:
# Load the clans dataset; the first CSV column becomes the DataFrame index.
DATASET_PATH = 'coc_clans_dataset.csv'
df = pd.read_csv(DATASET_PATH, index_col=0)

### I try to identify which columns I will use and which columns are unnecessary for me.

In [None]:
# Preview the first rows to get a feel for the available columns and their values.
df.head()

In [None]:
# Print column names, dtypes and non-null counts (reveals which columns have missing values).
df.info()

#### I like to use visualizations for pretty much everything :) Let's see how much missing data we have.


#### Data is clearly processed, we have only two columns with missing data.

## Let's explore one of those columns with empty cells, "clan_location".
## I want to see how many empty cells we have, after that explore its distribution.


In [None]:
# Split the rows into those without a clan location and those with one.
location_is_missing = df['clan_location'].isna()
missing_count = location_is_missing.sum()
non_missing_count = len(df) - missing_count

# Two-slice pie chart of the split, one colour per slice.
fig = px.pie(
    names=['Missing Data', 'Non-Missing Data'],
    values=[missing_count, non_missing_count],
    color_discrete_sequence=['lightcoral', 'lightgreen'],
)

# Label every slice with its share and name; pull the first ("Missing Data") slice outwards.
fig.update_traces(textinfo='percent+label', pull=[0.1, 0])
fig.update_layout(title='Missing Data in "clan_location" Column')

fig.show()

In [None]:
# Display the distinct location values alongside how often each one occurs.
clan_locations = df['clan_location']
clan_locations.unique(), clan_locations.value_counts()

In [None]:
# Number of clans whose location is exactly 'Türkiye' (rows with a missing location never match).
count = df['clan_location'].eq('Türkiye').sum()
count

In [None]:
# The twenty most frequent clan locations as a two-column table ready for plotting.
location_frequencies = df['clan_location'].value_counts()
top_20_locations = (
    location_frequencies.head(20)
    .rename_axis('Clan Location')
    .reset_index(name='Count')
)

# One bar per location, annotated with its count.
fig = px.bar(top_20_locations, x='Clan Location', y='Count', text='Count')

# Figure title and axis captions.
fig.update_layout(
    title='Count of Clan Locations',
    xaxis_title='Clan Location',
    yaxis_title='Count',
)

# Slant the tick labels so long country names stay legible.
fig.update_xaxes(tickangle=45)

fig.show()

### Indonesians seem to love Clash of Clans :)

## This was a simple exploration of a single column. Let's dive deeper into the rest of the data.

### Is Family Friendly? Let's find out.

In [None]:
# Tally clans by their isFamilyFriendly flag and shape the result for plotting.
family_friendly_counts = (
    df['isFamilyFriendly']
    .value_counts()
    .rename_axis('Is Family-Friendly')
    .reset_index(name='Count')
)

# One coloured bar per flag value, annotated with its count.
fig = px.bar(
    family_friendly_counts,
    x='Is Family-Friendly',
    y='Count',
    text='Count',
    color='Is Family-Friendly',
    labels={'Is Family-Friendly': 'Family-Friendly'},
)

# Figure title and axis captions.
fig.update_layout(
    title='Family-Friendly vs. Not Family-Friendly Clans',
    xaxis_title='Is Family-Friendly',
    yaxis_title='Count',
)

fig.show()

### When we look into the clan_level column, we see that most clans are just level 1 and the frequency decreases sharply. So even with the bins, visualization is not very pleasant, but I'll keep it either way :)

In [None]:
# Show how many clans sit at each clan level, most common level first.
df['clan_level'].value_counts()

In [None]:
# Group levels 1-6, 7-12, 13-18, and so on.
# pd.cut uses right-closed intervals by default, so these edges give (0, 6], (6, 12], ...
bins = [0, 6, 12, 18, 24, 30, 36]
labels = ['1-6', '7-12', '13-18', '19-24', '25-30', '31-36']

# Add a categorical column holding the level group of every clan.
# NOTE: levels outside (0, 36] match no bin and become NaN in this column.
df['clan_level_group'] = pd.cut(df['clan_level'], bins=bins, labels=labels)

# Count clans per group. value_counts() orders by frequency, which would scramble
# the ordinal level groups on the x axis, so sort_index() restores ascending level order.
clan_level_counts = df['clan_level_group'].value_counts().sort_index().reset_index()
clan_level_counts.columns = ['Clan Level Group', 'Count']

# One coloured bar per level group; category_orders pins the axis and legend
# to the level order regardless of how the counts happen to be sorted.
fig = px.bar(
    clan_level_counts,
    x='Clan Level Group',
    y='Count',
    text='Count',
    color='Clan Level Group',
    category_orders={'Clan Level Group': labels},
)

# Customize the layout
fig.update_layout(
    title='Count of Clans in Grouped Clan Levels',
    xaxis_title='Clan Level Group',
    yaxis_title='Count'
)

# Show the bar chart
fig.show()

#### Let's show something obvious:

In [None]:
# Scatter of war wins against clan level, one point per clan, with readable axis captions.
axis_captions = {'clan_level': 'Clan Level', 'war_wins': 'War Wins'}
fig = px.scatter(
    df,
    x='clan_level',
    y='war_wins',
    labels=axis_captions,
    title='Correlation between Clan Level and War Wins',
)

fig.show()

### This scatter plot took some time to render, most probably because of the size of the data.

## Distribution of Clan Leagues

In [None]:
# Show how many clans belong to each capital league, most common league first.
df['capital_league'].value_counts()

In [None]:

# Tally clans per capital league once; the tally feeds both the table and the bar ordering.
capital_league_tally = df['capital_league'].value_counts()
league_counts = capital_league_tally.rename_axis('Capital League').reset_index(name='Count')

# One coloured bar per league, ordered by descending clan count.
fig = px.bar(
    league_counts,
    x='Capital League',
    y='Count',
    text='Count',
    color='Capital League',
    category_orders={"Capital League": capital_league_tally.index},
)

# Figure title and axis captions.
fig.update_layout(
    title='Distribution of Clans in Leagues',
    xaxis_title='Capital League',
    yaxis_title='Count',
)

# Slant the tick labels so league names do not overlap.
fig.update_xaxes(tickangle=45)

fig.show()

### There are so many unranked clans; let's exclude them for the sake of the graph.

In [None]:
# Keep every row whose capital league is anything other than "Unranked".
is_ranked = df['capital_league'].ne('Unranked')
df_ranked = df.loc[is_ranked]

# Tally the remaining clans per league; the tally feeds both the table and the bar ordering.
ranked_league_tally = df_ranked['capital_league'].value_counts()
league_counts = ranked_league_tally.rename_axis('Capital League').reset_index(name='Count')

# One coloured bar per ranked league, ordered by descending clan count.
fig = px.bar(
    league_counts,
    x='Capital League',
    y='Count',
    text='Count',
    color='Capital League',
    category_orders={"Capital League": ranked_league_tally.index},
)

# Figure title and axis captions.
fig.update_layout(
    title='Distribution of Clans in Ranked Leagues (Excluding "Unranked")',
    xaxis_title='Capital League',
    yaxis_title='Count',
)

# Slant the tick labels so league names do not overlap.
fig.update_xaxes(tickangle=45)

fig.show()

## Correlation Map

In [None]:
# Re-list the column names and dtypes (now including the added 'clan_level_group' column).
df.info()

In [None]:
# Columns whose pairwise correlations are compared.
selected_columns = ['clan_level', 'clan_points', 'clan_builder_base_points', 'clan_versus_points',
                    'required_trophies', 'war_win_streak', 'war_wins', 'war_ties', 'war_losses',
                    'num_members', 'required_builder_base_trophies', 'required_versus_trophies', 
                    'required_townhall_level', 'clan_capital_hall_level', 'clan_capital_points', 
                    'mean_member_level', 'mean_member_trophies']

# Pairwise correlation between the selected columns (DataFrame.corr default method).
correlation_matrix = df[selected_columns].corr()

# Create a correlation heatmap using Plotly.
# Cells are coloured by the exact coefficients, but annotated with values rounded to
# two decimals: the default annotation is the raw z value, and full-precision floats
# on a 17x17 grid overlap into an unreadable mess.
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=selected_columns,
    y=selected_columns,
    annotation_text=correlation_matrix.round(2).values,
    colorscale='Viridis',
    showscale=True
)

# Customize the layout
fig.update_layout(
    title='Correlation Map of Selected Columns',
    xaxis_title='Columns',
    yaxis_title='Columns'
)

# Show the heatmap
fig.show()

## It would be interesting to run ML models on this dataset. We first need to identify the target and the features; EDA is needed before the modeling step.