In [35]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Remove the column-count cap and the fixed line width pandas applies when
# printing DataFrames, so wide frames are shown in full.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# ============================================================================
# SECTION 1: LOAD AND EXPLORE DATASET
# ============================================================================

# Read the assignment CSV, using its first column as the row index
df = pd.read_csv('assignment_3_dataset.csv', index_col=0)

# Quick look at the frame: size, leading rows, and the dtypes pandas inferred
print(f"Dataset Shape: {df.shape}")
print("\nFirst Few Rows:", df.head(10), sep="\n")
print("\nData Types:", df.dtypes, sep="\n")

# ============================================================================
# SECTION 2: DATA CLEANING
# ============================================================================

# Work on a copy so the raw frame stays available; both '??' and the empty
# string are placeholders for missing values and become NaN.
df_clean = df.copy().replace(['??', ''], np.nan)

# Raw language labels mix cases (e.g. 'c' and 'C'); fold them to lowercase
df_clean['lang'] = df_clean['lang'].str.lower()

# Force the measurement columns to numeric dtype; entries that cannot be
# parsed are coerced to NaN instead of raising.
numeric_columns = ['z1000t', 'z0t', 'z1000mem', 'stmtL', 'z1000rel', 'm1000rel', 'whours']
df_clean = df_clean.assign(
    **{name: pd.to_numeric(df_clean[name], errors='coerce') for name in numeric_columns}
)

print("\nCleaned Dataset Summary:", df_clean.describe(), sep="\n")

# ============================================================================
# SECTION 3: PERFORMANCE vs RELIABILITY (z1000t vs z1000rel)
# ============================================================================

# Keep only rows where person, language, timing and reliability are all present
perf_rel_columns = ['person', 'lang', 'z1000t', 'z1000rel']
df_perf_rel = df_clean[perf_rel_columns].dropna()

# Correlation between z1000t and z1000rel over the whole retained sample
perf_rel_corr = df_perf_rel['z1000t'].corr(df_perf_rel['z1000rel'])

print("\n" + "="*80)
print("PERFORMANCE vs RELIABILITY ANALYSIS")
print("="*80)
print(f"Records: {len(df_perf_rel)}")
print(f"Overall Correlation: {perf_rel_corr:.4f}")

# Scatter of z1000t against z1000rel, one colour per language.
# NOTE(review): combined with color='lang', trendline='ols' appears to fit one
# line per language trace (all drawn red), not a single overall line — confirm
# that this is the intended reading of the chart.
fig1 = px.scatter(
    df_perf_rel,
    x='z1000t',
    y='z1000rel',
    color='lang',
    hover_data=['person'],
    trendline='ols',
    trendline_color_override='red',
    template='plotly_white',
    title='Performance vs Reliability: z1000t vs z1000rel',
    labels={'z1000t': 'z1000t (Execution Time)', 'z1000rel': 'z1000rel (%)'},
)
fig1.update_layout(width=1000, height=600)
fig1.show()

# By-language scatter plots: one panel per language, laid out on a 3-column grid
languages = sorted(df_perf_rel['lang'].unique())
n_langs = len(languages)
cols = 3
# Ceiling division gives the number of grid rows; keep at least one row because
# make_subplots rejects rows=0 (which would happen for an empty frame).
rows = max(1, (n_langs + cols - 1) // cols)

fig2 = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=[lang.upper() for lang in languages],
    specs=[[{'type': 'scatter'} for _ in range(cols)] for _ in range(rows)]
)

# Marker colours are taken from this palette by language position (cycled)
colors = px.colors.qualitative.Plotly

for idx, lang in enumerate(languages):
    # Map the flat language index onto 1-based (row, col) grid coordinates
    row = idx // cols + 1
    col = idx % cols + 1
    lang_data = df_perf_rel[df_perf_rel['lang'] == lang]

    fig2.add_trace(
        go.Scatter(
            x=lang_data['z1000t'], y=lang_data['z1000rel'],
            mode='markers', marker=dict(size=8, color=colors[idx % len(colors)]),
            name=lang.upper(), text=lang_data['person'],
            hovertemplate='<b>%{text}</b><br>z1000t: %{x:.2f}<br>z1000rel: %{y:.2f}<extra></extra>',
            showlegend=False
        ), row=row, col=col
    )

    # A straight-line fit needs at least two DISTINCT x values. Checking only
    # the row count would still fit groups whose x values are all equal, which
    # is rank-deficient and yields a meaningless zero-length trend line.
    if lang_data['z1000t'].nunique() > 1:
        slope, intercept = np.polyfit(lang_data['z1000t'], lang_data['z1000rel'], 1)
        x_trend = np.linspace(lang_data['z1000t'].min(), lang_data['z1000t'].max(), 100)
        fig2.add_trace(
            go.Scatter(x=x_trend, y=slope * x_trend + intercept, mode='lines',
                      line=dict(color='red', width=2, dash='dash'),
                      showlegend=False, hoverinfo='skip'),
            row=row, col=col
        )

fig2.update_layout(height=400*rows, width=1200, title_text='Performance vs Reliability by Language', showlegend=False)
fig2.show()

# ============================================================================
# SECTION 4: MEMORY vs RELIABILITY (z1000mem vs m1000rel)
# ============================================================================

# Keep only rows where person, language, memory and reliability are all present
mem_rel_columns = ['person', 'lang', 'z1000mem', 'm1000rel']
df_mem_rel = df_clean[mem_rel_columns].dropna()

# Correlation between z1000mem and m1000rel over the whole retained sample
mem_rel_corr = df_mem_rel['z1000mem'].corr(df_mem_rel['m1000rel'])

print("\n" + "="*80)
print("MEMORY vs RELIABILITY ANALYSIS")
print("="*80)
print(f"Records: {len(df_mem_rel)}")
print(f"Overall Correlation: {mem_rel_corr:.4f}")

# Scatter of z1000mem against m1000rel, one colour per language.
# NOTE(review): combined with color='lang', trendline='ols' appears to fit one
# line per language trace (all drawn red), not a single overall line — confirm
# that this is the intended reading of the chart.
fig3 = px.scatter(
    df_mem_rel,
    x='z1000mem',
    y='m1000rel',
    color='lang',
    hover_data=['person'],
    trendline='ols',
    trendline_color_override='red',
    template='plotly_white',
    title='Memory vs Reliability: z1000mem vs m1000rel',
    labels={'z1000mem': 'z1000mem (Memory)', 'm1000rel': 'm1000rel (%)'},
)
fig3.update_layout(width=1000, height=600)
fig3.show()

# By-language scatter plots for memory.
# The panel grid is derived from the languages present in df_mem_rel itself:
# a language that has memory data must not be dropped (or shown as an empty
# panel) just because a differently filtered frame had other languages.
mem_languages = sorted(df_mem_rel['lang'].unique())
mem_cols = 3
# Ceiling division gives the number of grid rows; keep at least one row because
# make_subplots rejects rows=0 (which would happen for an empty frame).
mem_rows = max(1, (len(mem_languages) + mem_cols - 1) // mem_cols)
# Marker colours are taken from this palette by language position (cycled)
mem_colors = px.colors.qualitative.Plotly

fig4 = make_subplots(
    rows=mem_rows, cols=mem_cols,
    subplot_titles=[lang.upper() for lang in mem_languages],
    specs=[[{'type': 'scatter'} for _ in range(mem_cols)] for _ in range(mem_rows)]
)

for idx, lang in enumerate(mem_languages):
    # Map the flat language index onto 1-based (row, col) grid coordinates
    row = idx // mem_cols + 1
    col = idx % mem_cols + 1
    lang_data = df_mem_rel[df_mem_rel['lang'] == lang]

    fig4.add_trace(
        go.Scatter(
            x=lang_data['z1000mem'], y=lang_data['m1000rel'],
            mode='markers', marker=dict(size=8, color=mem_colors[idx % len(mem_colors)]),
            name=lang.upper(), text=lang_data['person'],
            hovertemplate='<b>%{text}</b><br>z1000mem: %{x:.0f}<br>m1000rel: %{y:.2f}<extra></extra>',
            showlegend=False
        ), row=row, col=col
    )

    # A straight-line fit needs at least two DISTINCT x values. Checking only
    # the row count would still fit groups whose x values are all equal, which
    # is rank-deficient and yields a meaningless zero-length trend line.
    if lang_data['z1000mem'].nunique() > 1:
        slope, intercept = np.polyfit(lang_data['z1000mem'], lang_data['m1000rel'], 1)
        x_trend = np.linspace(lang_data['z1000mem'].min(), lang_data['z1000mem'].max(), 100)
        fig4.add_trace(
            go.Scatter(x=x_trend, y=slope * x_trend + intercept, mode='lines',
                      line=dict(color='red', width=2, dash='dash'),
                      showlegend=False, hoverinfo='skip'),
            row=row, col=col
        )

fig4.update_layout(height=400*mem_rows, width=1200, title_text='Memory vs Reliability by Language', showlegend=False)
fig4.show()

# ============================================================================
# SECTION 5: OUTLIER DETECTION
# ============================================================================

def detect_outliers_iqr(data, factor=1.5):
    """Flag values lying outside the IQR fences of *data*.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        data: Object exposing ``quantile`` and element-wise comparison
            (called here with a pandas Series of numbers).
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5, the value previously hard-coded.

    Returns:
        A 3-tuple ``(mask, lower_bound, upper_bound)`` where ``mask`` is True
        for elements strictly below ``lower_bound`` or strictly above
        ``upper_bound``.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    spread = factor * (q3 - q1)
    lower_bound = q1 - spread
    upper_bound = q3 + spread
    # Values exactly on a fence are NOT flagged (strict comparisons)
    return (data < lower_bound) | (data > upper_bound), lower_bound, upper_bound

# Analyze z1000t outliers: flag values outside the IQR fences of the non-null timings
df_z1000t = df_clean[['person', 'lang', 'z1000t']].dropna()
outliers_t_iqr, lb_t, ub_t = detect_outliers_iqr(df_z1000t['z1000t'])

n_time_outliers = outliers_t_iqr.sum()
time_outlier_pct = 100*n_time_outliers/len(df_z1000t)

print("\n" + "="*80)
print("OUTLIER DETECTION: z1000t (Execution Time)")
print("="*80)
print(f"Total records: {len(df_z1000t)}")
print(f"IQR Bounds: [{lb_t:.2f}, {ub_t:.2f}]")
print(f"Outliers detected: {n_time_outliers} ({time_outlier_pct:.1f}%)")
print("\nExtreme Values:")
# List flagged rows from the largest timing downwards
flagged_times = df_z1000t[outliers_t_iqr].sort_values('z1000t', ascending=False)
for who, language, elapsed in zip(flagged_times['person'], flagged_times['lang'], flagged_times['z1000t']):
    print(f"  {who} ({language}): {elapsed:.2f}")

# Box plot for z1000t, one box per language
fig5 = px.box(
    df_z1000t,
    x='lang',
    y='z1000t',
    title='Distribution of z1000t by Language',
    labels={'z1000t': 'z1000t (Execution Time)', 'lang': 'Language'},
    template='plotly_white',
)
fig5.update_layout(width=1000, height=600)
fig5.show()

# Histogram for z1000t, bars grouped by language, with both IQR fences marked
fig6 = px.histogram(
    df_z1000t,
    x='z1000t',
    nbins=30,
    color='lang',
    title='Histogram of z1000t with Outlier Bounds',
    labels={'z1000t': 'z1000t (Execution Time)'},
    barmode='group',
    template='plotly_white',
)
for fence_name, fence_value in (("Upper", ub_t), ("Lower", lb_t)):
    fig6.add_vline(x=fence_value, line_dash="dash", line_color="red",
                   annotation_text=f"{fence_name}: {fence_value:.2f}")
fig6.update_layout(width=1000, height=600)
fig6.show()

# Analyze z1000mem outliers: flag values outside the IQR fences of the non-null memory readings
df_z1000mem = df_clean[['person', 'lang', 'z1000mem']].dropna()
outliers_mem_iqr, lb_mem, ub_mem = detect_outliers_iqr(df_z1000mem['z1000mem'])

n_mem_outliers = outliers_mem_iqr.sum()
mem_outlier_pct = 100*n_mem_outliers/len(df_z1000mem)

print("\n" + "="*80)
print("OUTLIER DETECTION: z1000mem (Memory Usage)")
print("="*80)
print(f"Total records: {len(df_z1000mem)}")
print(f"IQR Bounds: [{lb_mem:.0f}, {ub_mem:.0f}]")
print(f"Outliers detected: {n_mem_outliers} ({mem_outlier_pct:.1f}%)")
print("\nExtreme Values:")
# List flagged rows from the largest memory reading downwards
flagged_mem = df_z1000mem[outliers_mem_iqr].sort_values('z1000mem', ascending=False)
for who, language, mem_used in zip(flagged_mem['person'], flagged_mem['lang'], flagged_mem['z1000mem']):
    print(f"  {who} ({language}): {mem_used:.0f}")

# Box plot for z1000mem, one box per language
fig7 = px.box(
    df_z1000mem,
    x='lang',
    y='z1000mem',
    title='Distribution of z1000mem by Language',
    labels={'z1000mem': 'z1000mem (Memory)', 'lang': 'Language'},
    template='plotly_white',
)
fig7.update_layout(width=1000, height=600)
fig7.show()

# Histogram for z1000mem, bars grouped by language, with both IQR fences marked
fig8 = px.histogram(
    df_z1000mem,
    x='z1000mem',
    nbins=30,
    color='lang',
    title='Histogram of z1000mem with Outlier Bounds',
    labels={'z1000mem': 'z1000mem (Memory)'},
    barmode='group',
    template='plotly_white',
)
for fence_name, fence_value in (("Upper", ub_mem), ("Lower", lb_mem)):
    fig8.add_vline(x=fence_value, line_dash="dash", line_color="red",
                   annotation_text=f"{fence_name}: {fence_value:.0f}")
fig8.update_layout(width=1000, height=600)
fig8.show()

# Scatter plots with outliers highlighted.
# Points are classified with the SAME IQR fences that were computed and printed
# in the outlier reports (lb_t/ub_t and lb_mem/ub_mem). Recomputing the fences
# here would use a different row subset (rows lacking the reliability column
# are dropped first), so the colouring could disagree with the reported
# outliers.
df_analysis = df_clean[['person', 'lang', 'z1000t', 'z1000rel']].dropna()
df_analysis['is_outlier'] = (df_analysis['z1000t'] < lb_t) | (df_analysis['z1000t'] > ub_t)
df_analysis['is_outlier_str'] = df_analysis['is_outlier'].map({True: 'Outlier', False: 'Normal'})

fig9 = px.scatter(df_analysis, x='z1000t', y='z1000rel', color='is_outlier_str',
                  hover_data=['person', 'lang'],
                  title='z1000t vs z1000rel with Outliers',
                  color_discrete_map={'Outlier': 'red', 'Normal': 'blue'},
                  template='plotly_white')
fig9.update_traces(marker=dict(size=10))
fig9.update_layout(height=600, width=1000)
fig9.show()

df_analysis_mem = df_clean[['person', 'lang', 'z1000mem', 'm1000rel']].dropna()
df_analysis_mem['is_outlier'] = (df_analysis_mem['z1000mem'] < lb_mem) | (df_analysis_mem['z1000mem'] > ub_mem)
df_analysis_mem['is_outlier_str'] = df_analysis_mem['is_outlier'].map({True: 'Outlier', False: 'Normal'})

fig10 = px.scatter(df_analysis_mem, x='z1000mem', y='m1000rel', color='is_outlier_str',
                   hover_data=['person', 'lang'],
                   title='z1000mem vs m1000rel with Outliers',
                   color_discrete_map={'Outlier': 'red', 'Normal': 'blue'},
                   template='plotly_white')
fig10.update_traces(marker=dict(size=10))
fig10.update_layout(height=600, width=1000)
fig10.show()

# Closing banner marking the end of the notebook cell's output
print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)

Dataset Shape: (80, 10)

First Few Rows:
  person lang  z1000t    z0t  z1000mem  stmtL  z1000rel  m1000rel  whours caps
0   s018    c   0.017  0.017     22432   16.1     98.10      96.8    16.1   ??
1   s030    C     NaN  0.033     16968    4.0     76.47      92.1     4.0   ??
2   s036    c  20.900  0.000     11440    8.2      0.00      89.5     8.2   ??
3   s066    C   0.750  0.467      2952    7.3     98.48     100.0     7.3   ??
4   s078    c   0.050  0.050     22496   10.9     99.24      98.4    10.9   ??
5   s015  c++   0.050  0.050     24616   11.2     99.24     100.0    11.2   ??
6   s020  C++   1.983  0.550      6384    3.0     98.48      98.4     3.0   ??
7   s021  c++   4.867  0.017      5312    NaN    100.00      98.4     NaN   ??
8   s025  c++   0.083  0.083     28568    3.5     99.24      98.4     3.5   ??
9   s027  c++   1.533  0.000      3472   25.3     98.09     100.0    25.3   ??

Data Types:
person       object
lang         object
z1000t      float64
z0t         float64


MEMORY vs RELIABILITY ANALYSIS
Records: 80
Overall Correlation: -0.3416



OUTLIER DETECTION: z1000t (Execution Time)
Total records: 75
IQR Bounds: [-2.64, 4.93]
Outliers detected: 14 (18.7%)

Extreme Values:
  s149408 (tcl): 202.80
  s149209 (python): 72.30
  s149113 (perl): 67.50
  s093 (java): 37.10
  s068 (java): 31.20
  s072 (java): 30.10
  s149410 (tcl): 29.40
  s149302 (rexx): 25.40
  s149102 (perl): 21.40
  s034 (c++): 21.40
  s149303 (rexx): 21.00
  s036 (c): 20.90
  s062 (java): 16.80
  s047 (java): 6.47



OUTLIER DETECTION: z1000mem (Memory Usage)
Total records: 80
IQR Bounds: [-17576, 75240]
Outliers detected: 3 (3.8%)

Extreme Values:
  s149211 (python): 91120
  s023 (java): 89664
  s081 (java): 79544



ANALYSIS COMPLETE
