<a href="https://colab.research.google.com/github/Karlajack/Python_classes/blob/main/CPU_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each download.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'cpu-and-gpu-product-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2568515%2F4367503%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240429%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240429T103631Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da13075a54b220eedd6229ddb0f2f37c26975dec4bb8a0a8ee4b38ae5dde4de2e5f59f151b91a4634896f895eaf6bcd73c590adf887988e1ac18a579bfb4c526288a536ddc5ea55d1a1babe4bb2f9f70c8cb051a2476fe63545cc6c1519043c2b85b06eaffd0ae1c4939eba2b5b5daffd8fbcbf233a55c8559595e6b1950fdbfea8808f8644b2e6dddd10f920b8eb447931651e28e07bac5630f766343b500e01c5c3b8a7953fbfed326babb67d98146fb43c3211742c1f98782e66b6bcb7890a27d52577880cee33baf06441805f1215a71c2bf43fa737c21a0b8c59a03df6066973cde626ffe141c906a50cb14d25d68cf5c6507cf11cb224ab59e7fade5a53'

# Directories created below so the notebook sees the same paths it would on Kaggle.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then make sure both Kaggle-style
# directories exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working. A link left over from a
# previous run is kept as it is.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING and unpack it into KAGGLE_INPUT_PATH/<directory>.
# A source that fails to download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a colon inside the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header lookup yields None when the server sends no
            # Content-Length; in that case the progress bar stays empty
            # instead of raising on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name. Reusing `tarfile` here
                # would replace the module and break the next tar download.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# **\[0\]** Importing Libraries & Reading Data

In [None]:
import re

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
%matplotlib inline

# Start from the ggplot style sheet, then override individual defaults.
plt.style.use('ggplot')

rc_overrides = {
    # Titles: bold and left-aligned.
    'axes.titleweight': 'bold',
    'axes.titlelocation': 'left',
    'figure.titleweight': 'bold',
    # Legends: half-transparent white box.
    'legend.framealpha': 0.5,
    'legend.facecolor': 'white',
    # Same colour for the plot background and the scatter marker edges.
    'axes.facecolor': '#DDDDFF',
    'scatter.edgecolors': '#DDDDFF',
}
for rc_key, rc_value in rc_overrides.items():
    plt.rcParams[rc_key] = rc_value

# Let pandas print up to 200 columns before it starts eliding them.
pd.options.display.max_columns = 200

In [None]:
# Load the chip dataset; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv("/kaggle/input/cpu-and-gpu-product-data/chip_dataset.csv", index_col=0)

In [None]:
# (number of rows, number of columns) of the raw table
df.shape

In [None]:
# dtype pandas assigned to each column when reading the CSV
df.dtypes

In [None]:
# Raw column names, as they appear before the renaming done further down
df.columns

In [None]:
# Preview the first rows of the raw table
df.head()

In [None]:
# Number of missing values in each column
df.isna().sum()

# **\[1\]** Data Preprocessing
1. Remove Unnecessary data
2. Rename columns

In [None]:
# Keep only the columns used by the analysis and renumber the rows from 0.
# The three GFLOPS columns ('FP16 GFLOPS', 'FP32 GFLOPS', 'FP64 GFLOPS') that
# were commented out of the selection remain excluded.
keep_columns = [
    'Product',
    'Type',
    'Release Date',
    'Process Size (nm)',
    'TDP (W)',
    'Die Size (mm^2)',
    'Transistors (million)',
    'Freq (MHz)',
    'Foundry',
    'Vendor',
]
df = df.loc[:, keep_columns].reset_index(drop=True)

df.info()

In [None]:
# Map the raw CSV headers onto short snake_case names. 'Type', 'Vendor' and
# 'Foundry' are not listed and therefore keep their original spelling.
# NOTE: the transistor column holds millions; the `10e6` suffix is kept as-is
# because later cells refer to the column by this name.
short_names = {
    'Product': 'model',
    'Release Date': 'release_date',
    'Process Size (nm)': 'process_size_nm',
    'TDP (W)': 'tdp_w',
    'Die Size (mm^2)': 'die_size_mm2',
    'Transistors (million)': 'transistors_10e6',
    'Freq (MHz)': 'freq_mhz',
}
df = df.rename(columns=short_names)
df.head()

In [None]:
# Parse release_date as datetime64 and drop the rows that have no date.
# The explicit copy gives the assignments below an independent frame to write
# to, instead of a possible view of the unfiltered data (which is what
# SettingWithCopyWarning complains about).
df['release_date'] = pd.to_datetime(df['release_date'], format="%Y-%m-%d")
df = df.dropna(subset=['release_date']).copy()

# Fill NaN values with the column mean; mean() already ignores NaN.
for col in ('process_size_nm', 'die_size_mm2', 'transistors_10e6'):
    df[col] = df[col].fillna(df[col].mean())

# Set process size to int (the imputed mean is truncated to a whole number).
df['process_size_nm'] = df['process_size_nm'].astype(int)

# Release year as a plain integer column, used as the time axis later on.
df['rel_year'] = df['release_date'].dt.year.astype(int, errors='raise')

df.info()

In [None]:
# Keep only chips released before 2021. Copy the filtered rows so that the
# columns added to `df` afterwards are written to an independent frame rather
# than to a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df.query("rel_year < 2021").copy()
df.info()

In [None]:
# NOTE(review): df_use_cats is not used in the visible code.
df_use_cats = ["Server", "PC"]

# 'series' is the word that follows the first word of the model name.
df['series'] = df.model.str.extract(r"^\w+\s(\w+)\s*")

# True for rows whose model name mentions one of the listed product lines
# (case-insensitive) and whose Type contains 'CPU'.
tgt = df.model.str.contains('EPYC|Xeon|Quadro', regex=True, flags=re.IGNORECASE, na=False) \
     & df.Type.str.contains('CPU')

# Assign the labels positionally. Wrapping them in a fresh pd.Series would give
# them a 0..n-1 index, while df's index has gaps after the earlier row drops;
# index alignment would then mislabel rows and leave NaN in 'target'.
df['target'] = np.where(tgt, 'Server', 'PC')

df.head()

In [None]:
# Turn the label columns into ordered categoricals with a fixed level order.
categories = {
    'Vendor': ['AMD', 'Intel', 'Other', 'NVIDIA', 'ATI'],
    'Type': ['CPU', 'GPU'],
    'target': ['Server', 'PC'],
}
for column, levels in categories.items():
    as_categorical = pd.Categorical(df[column], categories=levels, ordered=True)
    df[column] = as_categorical

df.info()

In [None]:
# Four side-by-side count plots (Type, target, Vendor, Foundry) sharing one y axis.
fig, axs = plt.subplots(1, 4, figsize=(12, 6), sharey=True)

with sns.axes_style('ticks'):
    for column, axis in zip(('Type', 'target', 'Vendor'), axs):
        sns.countplot(df, x=column, ax=axis)
    sns.countplot(x=df.Foundry, ax=axs[3])

    # Category names are long, so every panel gets vertical tick labels.
    axs[0].set_xticklabels(axs[0].get_xticklabels(), rotation=90)

    # Only the leftmost panel keeps its y label and y tick marks.
    for axis in axs[1:]:
        axis.set_ylabel('')
        axis.tick_params(left=False)
        axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
    sns.despine(trim=False)

# [3] Questions

## 1. Does Moore's Law hold?
> Moore's Law is the observation that the number of transistors in an Integrated Circuit **doubles** about every *2 years*


For the dataset in use, we have *3* columns related to Moore's law:
1. **Process Size (nm)** - manufacturing process node (feature size) of the chip, in nanometres
2. **Transistors (million)** - number of transistors in millions
3. **Die Size ($mm^2$)** - area of the chip's die, in square millimetres

For Moore's Law to be upheld, there must be:
1. ***Negative* correlation** between <u>Process Size</u> and <u>Time</u>
2. ***Positive* correlation** between <u>Transistors</u> and <u>Time</u>

In [None]:
# Working copy for question 1: the columns needed for the Moore's-law plots,
# plus the absolute transistor count and its base-10 logarithm.
q1_cols = [
    'model', 'Type', 'transistors_10e6', 'rel_year',
    'process_size_nm', 'die_size_mm2', 'Vendor', 'target',
]
q1_data = df[q1_cols].copy()

# The source column is in millions; convert to an absolute count first.
q1_data['transistors'] = q1_data['transistors_10e6'] * 1_000_000
q1_data['transistors_log'] = np.log10(q1_data['transistors'])

In [None]:
# Histogram of transistor counts (in millions) on a log-scaled axis, with a KDE curve overlaid.
sns.displot(df.transistors_10e6,log_scale=True,kind='hist',kde=True)

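**Histogram of Transistor Count (log scale)** \
The histogram shows how transistor counts are distributed across the dataset. It has no time axis, so the general *increase* in Transistor Count over the years is examined in the plots that follow.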

In [None]:
# Settings shared by the plots that follow.
x_ax = 'rel_year'
x_label = 'Release Year $\\rightarrow$'

# Marker shapes handed to seaborn for the levels of the `style` variable.
marker_ls = ['o', 'X']

# Five colours, used as the palette for the `Vendor` hue.
brand_palette = sns.color_palette(
    palette=[
        '#ED1C24',
        '#0f7dc2',
        'xkcd:bluish grey',
        '#76b900',
        '#ED1B2F',
    ],
    n_colors=5,
)

### Plots between *Transistor Count* and *Year of Release*
1. Regression Plotting
2. Bar Plot

In [None]:
#===================================#
# Transistor Count vs. Release Year #
#===================================#
# Per-chip scatter (colour = vendor, marker = chip type) with a regression
# line and its 95% confidence band drawn on the same axes.

y_ax = 'transistors_log'
fig, ax = plt.subplots(figsize=(12,6))

# Arguments common to both seaborn layers.
layer_args = dict(data=q1_data, x=x_ax, y=y_ax, ax=ax)

sns.scatterplot(
    hue='Vendor', style='Type',
    palette=brand_palette, markers=marker_ls,
    alpha=.9, legend=True,
    **layer_args,
)
sns.regplot(
    scatter=False, ci=95,
    color='xkcd:royal purple', label='Regression Line',
    **layer_args,
)

ax.set_title(r"Transistor Count$(log_{10})$ vs. Release Year")
ax.set_xlabel(x_label)
ax.set_ylabel(r"Transistor Count$(log_{10}) \longrightarrow$")

ax.legend(ncols=2)

plt.show()

In [None]:
# Bar chart of log10 transistor count per release year. Each bar is seaborn's
# aggregate of the chips released that year; error bars are switched off.
fig, ax = plt.subplots(figsize=(12,6))

y_ax = 'transistors_log'

sns.barplot(data=q1_data,
            y=y_ax, x=x_ax, dodge=True,
            width=.5,errorbar=None,
            ax=ax)

ax.set_xlabel(x_label)
ax.set_ylabel(r"Transistor Count$(log_{10}) \longrightarrow$")

ax.set_title(r"Transistor Count$(log_{10})$ vs. Release Year")

# No legend: the plot has no hue and no labelled artists, so ax.legend() would
# only emit a "no artists with labels" warning and add an empty legend.

plt.show()

### Relational Plot between *Process Size* and *Release Year*

In [None]:
#===============================#
# Process Size vs. Release Year #
#===============================#
# Per-chip scatter (colour = vendor, marker = chip type) with a regression
# line and its 95% confidence band drawn on the same axes.

y_ax = 'process_size_nm'
fig, ax = plt.subplots(figsize=(10,5))

# Arguments common to both seaborn layers.
layer_args = dict(data=q1_data, x=x_ax, y=y_ax, ax=ax)

sns.scatterplot(
    hue='Vendor', style='Type',
    palette=brand_palette, markers=marker_ls,
    alpha=.9, legend=True,
    **layer_args,
)
sns.regplot(
    scatter=False, ci=95,
    color='xkcd:pine green', label='Regression Line',
    **layer_args,
)

ax.set_title(r"Process Size$(nm)$ vs. Release Year")
ax.set_xlabel(x_label)
ax.set_ylabel(r"Process Size$(nm) \longrightarrow$")

ax.legend(ncols=2)

plt.show()

In [None]:
# Bar chart of process size per release year. Each bar is seaborn's aggregate
# of the chips released that year; error bars are switched off.
fig, ax = plt.subplots(figsize=(12,6))

y_ax = 'process_size_nm'

sns.barplot(data=q1_data,
            y=y_ax, x=x_ax, dodge=True,
            width=.5, errorbar=None,
            ax=ax)

ax.set_xlabel(x_label)
ax.set_ylabel(r"Process Size$(nm) \longrightarrow$")

ax.set_title(r"Process Size$(nm)$ vs. Release Year")

# No legend: the plot has no hue and no labelled artists, so ax.legend() would
# only emit a "no artists with labels" warning and add an empty legend.

plt.show()

### Relational Plot between *Die Size* and *Process Size*

In [None]:
#===========================#
# Process Size vs. Die Size #
#===========================#
# Die size (y) against process size (x): per-chip scatter coloured by vendor
# and marked by chip type, plus a robust regression line.

fig, ax = plt.subplots(figsize=(10,5))

y_ax = 'die_size_mm2'

sns.scatterplot(data=q1_data,
            y=y_ax, x='process_size_nm',
            markers=marker_ls, legend='brief',
            palette=brand_palette,
            alpha=.6, hue='Vendor', style='Type',
            ax=ax)

sns.regplot(data=q1_data,
            y=y_ax, x='process_size_nm', ci=95, scatter=False,
            color='xkcd:bordeaux', robust=True, label='Regression Line',
            ax=ax)

# The y column is die_size_mm2, so the axis unit is mm^2 (not nm), matching the title.
ax.set_ylabel(r"Die Size$(mm^2) \longrightarrow$")
ax.set_xlabel(r"Process Size$(nm) \longrightarrow$")
ax.set_title(r"Process Size$(nm)$ vs. Die Size$(mm^2)$")

ax.legend(ncols=2)

plt.show()

In [None]:
# Pairwise correlations between process size, die size, chip type, release
# year and frequency, drawn as an annotated heatmap.
corr_cols = ['process_size_nm', 'die_size_mm2', 'Type', 'rel_year', 'freq_mhz']
corr_df = df[corr_cols].copy()

# Encode the chip type as 0/1 so it can take part in the correlation.
type_codes = {'CPU': 0, 'GPU': 1}
corr_df['Type'] = corr_df['Type'].astype(str).map(type_codes)

corr = corr_df.corr()

ax = sns.heatmap(
    data=corr,
    annot=True,
    cmap='crest',
    center=0,
    square=True,
    linewidths=.4,
)

# Display names, listed in the same order as the selected columns.
tick_labels = ['Process Size$(nm)$', 'Die Size$(mm^2)$','Type','Release Year','Frequency$(MHz)$']
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)
plt.show()

In [None]:
# Frequency against release year: per-chip points coloured by chip type, with
# a regression line over all chips.
fig, ax = plt.subplots(figsize=(12,6))

# `marker` (singular) is the matplotlib keyword that sets the point shape.
# seaborn's `markers` only applies to the levels of a `style` variable, which
# this plot does not use, so it had no effect here.
sns.scatterplot(df,x='rel_year',y='freq_mhz',hue='Type',marker='.',ax=ax)
sns.regplot(df,x='rel_year',y='freq_mhz',color='xkcd:pine green',scatter=False,ax=ax)

ax.set_xlabel('Release Year')
ax.set_ylabel('Frequency$(MHz)$')

plt.show()

In [None]:
# Number of chips per foundry, largest first, shown as a pie chart.
foundry_counts = df.Foundry.value_counts().sort_values(ascending=False)
df_fd = pd.DataFrame({
    'Count': foundry_counts,
    'Brand': foundry_counts.index,
})

fig = px.pie(df_fd, values='Count', names='Brand', title='Chip Manufacturer Stakes')
# Labels go inside the slices; any label that would need a font below 12 is hidden.
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

In [None]:
# Interactive 3-D scatter of process size, die size and transistor count
# (in millions), coloured by vendor.
axes_3d = dict(x='process_size_nm', y='die_size_mm2', z='transistors_10e6')
fig = px.scatter_3d(df, color='Vendor', **axes_3d)
fig.show()