In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [None]:
# Preview the first rows of the training data (shown as the cell output).
train.head()

## Task 1

#### Data Exploration and Preparation

You’ve received a new dataset for your upcoming project. Before diving into the main analysis, it's essential to assess the dataset’s consistency and prepare it for reliable and accurate results. Your task is to explore the data thoroughly and identify any potential issues that could impact your findings.

Evaluate the completeness and consistency of the data. Look for anomalies, any values or patterns that appear unusual or unexpected. Use appropriate visualizations to highlight these irregularities effectively.

Determine how to address these issues and clearly justify your approach. Your goal is to prepare a clean, reliable dataset that you can confidently use for analysis and modeling.

Data preprocessing is a critical phase—ensure your work is meticulous and well-documented, as it will serve as the foundation for all future tasks.

(1 point)

In [None]:
# Your code here, along with reasoning
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Your code here, along with reasoning
# Completeness check: per-column count and percentage of missing values,
# ordered from the most to the least incomplete column.
missing = train.isnull().sum()
missing_pct = missing / len(train) * 100
missing_df = pd.DataFrame(
    {"missing_count": missing, "missing_pct": missing_pct}
).sort_values("missing_pct", ascending=False)
top_missing = missing_df.head(20)

print("=== Missing Value Summary (Top 20) ===")
display(top_missing)

# Bar chart of the same 20 columns so gaps are visible at a glance.
plt.figure(figsize=(12, 5))
sns.barplot(data=top_missing, x=top_missing.index, y="missing_pct", color="steelblue")
plt.xticks(rotation=90)
plt.ylabel("% Missing")
plt.title("Top Missing Values (%)")
plt.show()

# Rows that are exact copies of an earlier row.
duplicate_mask = train.duplicated()
dup_count = duplicate_mask.sum()
print(f"\nDuplicate rows in training set: {dup_count}")

# Separate numeric and categorical columns (exclude obvious IDs).
# A column counts as ID-like only when every row holds a distinct value AND it
# is not floating point. Continuous measurements are frequently all-unique as
# well, and flagging them as IDs would silently remove them from the numeric
# checks (describe, plots, outlier counts) that follow.
id_like = [
    c for c in train.columns
    if not pd.api.types.is_float_dtype(train[c])
    and train[c].nunique() == len(train)
]
num_cols = train.select_dtypes(include=np.number).columns.difference(id_like)

# Descriptive stats to spot unusual ranges or zeros (one row per column).
desc = train[num_cols].describe().T
display(desc)

# Distribution check for the first six numeric columns: histogram with a KDE
# overlay on the left, boxplot of the same column on the right.
for col in num_cols[:6]:
    fig, ax = plt.subplots(1, 2, figsize=(10, 3))
    hist_ax, box_ax = ax

    sns.histplot(train[col].dropna(), kde=True, ax=hist_ax)
    hist_ax.set_title(f"{col} - Histogram")

    sns.boxplot(x=train[col], ax=box_ax)
    box_ax.set_title(f"{col} - Boxplot")

    fig.tight_layout()
    plt.show()

# Outlier counts using the IQR rule: a value is an outlier when it lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
def _iqr_outlier_count(values):
    """Return how many entries of ``values`` fall outside the 1.5 * IQR fences."""
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    below = values < q1 - 1.5 * spread
    above = values > q3 + 1.5 * spread
    return (below | above).sum()


outlier_counts = {col: _iqr_outlier_count(train[col]) for col in num_cols}
print("\nTop columns by outlier count:")
display(pd.Series(outlier_counts).sort_values(ascending=False).head(10))

# Non-numeric columns, again leaving out the ID-like ones.
cat_cols = train.select_dtypes(exclude=np.number).columns.difference(id_like)

# Ten most frequent values (missing values included) for the first five columns.
for col in cat_cols[:5]:
    top_values = train[col].value_counts(dropna=False).head(10)
    print(f"\n--- {col} ---")
    print(top_values)

# Columns with more than 50 distinct values.
cardinality = train[cat_cols].nunique()
high_card = cardinality[cardinality > 50].index.tolist()
print("\nHigh-cardinality categorical columns (>50 unique values):", high_card)

#Completeness: Reports and plots missing values, checks duplicates.
#Numeric anomalies: Shows descriptive stats, histograms, and boxplots; flags columns with many outliers.
#Categorical anomalies: Prints top frequencies to find unexpected categories; 
#lists high-cardinality columns that may need special encoding.

## Task 2

#### Data Driven Decisions and Analysis

Your company wants to adopt a Generative AI tool to speed up and automate certain tasks. Your task now is to:
- Conduct a thorough exploration of the relationships between all relevant variables in the dataset.
- Develop **TWO** additional metrics which can provide some interesting insights into the success of the Gen AI tools in various sectors. Explain your findings. (Hint: Try to quantify how valuable the investment in Gen AI is)
- Utilize appropriate visualization techniques to represent your findings.

As you analyze the data try doing the following to understand the data better:
- What overall trends and anomalies do you notice when examining the relationships between different variables?
- Consider all variables and their effects in collaboration while making your decision.
- Think of multiple data backed use cases that your company can pursue.


Remember, the goal is to provide actionable insights that can inform data-driven decision-making at your company.

(1 point)

In [None]:
# Your code here, along with reasoning
import matplotlib.pyplot as plt
import pandas as pd

# ===============================
# 1. Explore relationships (correlation heatmap)
# ===============================
num_cols = ['productivity_change_pct','employees_impacted','new_roles_created',
            'training_hours','deployment_cost','employee_sentiment_score',
            'automation_coverage_pct','voluntary_attritions','genai_role_salary',
            'Client_Satisfaction_Post_GenAI']

# Pairwise Pearson correlations between the selected numeric columns.
corr = train[num_cols].corr()

plt.figure(figsize=(12,8))
# Pin the colour scale to [-1, 1] so the neutral midpoint of the diverging
# palette always means r = 0; otherwise the colours are stretched to the
# observed min/max and weak correlations look strong.
plt.imshow(corr, cmap="coolwarm", interpolation="nearest", vmin=-1, vmax=1)
plt.colorbar()
plt.title("Correlation Heatmap (GenAI Adoption Metrics)", fontsize=14)
plt.xticks(range(len(corr)), corr.columns, rotation=90)
plt.yticks(range(len(corr)), corr.columns)
plt.show()

# Rank variable pairs by strength of association. Only the strict upper
# triangle is kept, so each pair appears once and the trivial self-correlations
# (r = 1) are excluded. Ranking uses |r| so strong negative relationships are
# reported too; the printed values keep their sign.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pair_corr = corr.where(upper_triangle).stack().dropna()
top_pairs = pair_corr.sort_values(key=lambda r: r.abs(), ascending=False).head(10)

print("Top correlations:\n",
      top_pairs)

# ===============================
# 2. Industry-level exploration
# ===============================
# Number of records contributed by each industry.
industry_counts = train["industry"].value_counts()
plt.figure(figsize=(10, 5))
industry_counts.plot.bar()
plt.title("Industry Distribution of GenAI Adoption")
plt.ylabel("Count")
plt.show()

# Mean productivity change per industry, highest first.
industry_perf = (
    train.groupby("industry")["productivity_change_pct"]
    .mean()
    .sort_values(ascending=False)
)
industry_perf.plot.bar(figsize=(10, 5))
plt.title("Average Productivity Change by Industry (Post GenAI)")
plt.ylabel("Mean % Change")
plt.show()

# ===============================
# 3. Define TWO New Business Metrics
# ===============================
# NOTE: both metrics are computed from productivity_change_pct and are stored
# as new columns on `train`; they must not be used as predictors of that
# variable.

# Metric 1: ROI Proxy (Productivity gain per dollar spent)
# A zero cost would make the ratio +/-inf (which the histogram cannot bin and
# which distorts describe()), and a negative cost would flip the sign of the
# ratio, so non-positive costs are masked and yield NaN instead.
cost = train["deployment_cost"]
train["ROI_Proxy"] = train["productivity_change_pct"] / cost.where(cost > 0)

plt.figure(figsize=(8,5))
train["ROI_Proxy"].hist(bins=30)
plt.title("Distribution of ROI Proxy (Productivity % per $ spent)")
plt.xlabel("ROI Proxy")
plt.ylabel("Frequency")
plt.show()

# Metric 2: Employee Value Add (Client Satisfaction * Productivity) normalized by attrition
# Attrition counts are floored at 0 before adding 1: the +1 only protects
# against division by zero when the count is non-negative (a stray -1 would
# otherwise produce a zero denominator).
attritions = train["voluntary_attritions"].clip(lower=0)
train["Employee_Value_Add"] = (
    (train["Client_Satisfaction_Post_GenAI"] * train["productivity_change_pct"]) /
    (attritions + 1)
)

plt.figure(figsize=(8,5))
train["Employee_Value_Add"].plot(kind="box")
plt.title("Employee Value Add Score Distribution")
plt.ylabel("Value Add Score")
plt.show()

# Summary of new metrics
print("\n=== New Metrics Summary ===")
print(train[["ROI_Proxy","Employee_Value_Add"]].describe())

# ===============================
# 4. Insights & Use Cases (Template)
# ===============================
# NOTE: the text printed below is a fixed string literal. Nothing in it is
# computed from the data, so each statement should be checked against the
# plots and tables produced above before it is relied on.
print("""
Insights:
1. Certain industries show the largest productivity gains from GenAI adoption.
2. ROI_Proxy highlights which sectors achieve higher productivity improvements per dollar spent.
3. Employee_Value_Add identifies where customer satisfaction and productivity improvements remain strong despite attrition.
4. Correlations suggest that higher training_hours often align with better employee_sentiment_score, 
   meaning workforce enablement is critical.

Use Cases:
- Prioritize scaling GenAI in industries with the highest ROI_Proxy.
- Use Employee_Value_Add to target sectors where GenAI improves both workforce outcomes and client satisfaction.
- Invest in employee training programs, as they positively influence both sentiment and adoption success.
""")


## Task 3

#### Correlation Analysis

1. Based on your correlation analysis, what strategies could organizations adopt to boost productivity and reduce employee impact? Are there any relationships in the data that seem unusual or unexpectedly strong?

2. Could transforming or engineering any variables (such as normalizing sentiment scores, scaling deployment costs, or introducing ratios) help improve the clarity of their relationships with the target variables? What would your rationale be for applying such adjustments?
Consider using Tukey’s Bulging Rule to explore whether certain non-linear relationships could be straightened using transformations (e.g. log, square root, reciprocal). Which variables might benefit from such transformations, and why?

3. Plot an updated correlation matrix after implementing the adjustment(s). Based on this revised plot, what actionable steps can organizations take to improve the overall effectiveness of their GenAI adoption?

(1 point)

In [None]:
# Your code here, along with reasoning
# --- Copy numeric columns ---
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
# Infinite values (for example from ratio features with a zero denominator)
# would turn every correlation involving that column into NaN, so they are
# treated as missing. replace() returns a new frame; `train` is not modified.
df_corr = train[numeric_cols].replace([np.inf, -np.inf], np.nan)

# Transformations (log for skewed vars)
df_trans = df_corr.copy()
for col in ['deployment_cost','genai_role_salary','employees_impacted']:
    # log1p is only defined for values > -1; negative entries are masked to
    # NaN instead of producing NaN/-inf together with a RuntimeWarning.
    non_negative = df_trans[col].where(df_trans[col] >= 0)
    df_trans[col+'_log'] = np.log1p(non_negative)  # add transformed columns

# Updated correlation matrix (original columns plus the *_log columns)
corr_matrix = df_trans.corr()

plt.figure(figsize=(12,10))
# vmin/vmax pin the palette to the full correlation range so colours are
# comparable between plots; center=0 keeps the neutral colour at r = 0.
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title("Updated Correlation Matrix After Transformations")
plt.show()

# Scatter plots for selected variable pairs. Each pair is plotted only from
# rows where both variables are present, so the x and y arrays stay aligned.
pairs = [
    ('deployment_cost_log', 'productivity_change_pct'),
    ('employees_impacted_log', 'productivity_change_pct'),
    ('employee_sentiment_score', 'employees_impacted'),
]

for x, y in pairs:
    # Skip a pair when either column is absent from the transformed frame.
    if x not in df_trans.columns or y not in df_trans.columns:
        continue

    aligned = df_trans[[x, y]].dropna()

    plt.figure(figsize=(6, 4))
    plt.scatter(aligned[x], aligned[y], s=6)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(f'{x} vs {y}')
    plt.show()

## Task 4

#### Model Building

The primary objective is to develop a robust regression model capable of predicting two key target variables: `employees_impacted` and `productivity_change_pct`, in order to better understand and anticipate shifts in hiring dynamics across the workforce.

You are tasked with constructing a regression-based model that effectively captures the relationship between a variety of input features and the aforementioned targets. Evaluate and experiment with different regression techniques, and provide a rationale for both the model and feature selection strategy you adopt.

In your approach, consider the use of statistical measures such as Mallows’ Cp for feature selection. Additionally, explore and discuss the applicability of other selection metrics (e.g. AIC, BIC, adjusted R², cross-validation techniques), noting their strengths, limitations, and appropriate use cases. (**You need not write code for this part of the question**)

(2 points)

In [None]:
# --------------------------------
# Task 4: Model Building (Clean Version with Imports)
# --------------------------------

# Imports
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Features & targets
target_cols = ["employees_impacted", "productivity_change_pct"]

# Columns that must never be used as predictors:
# - ROI_Proxy and Employee_Value_Add are computed from productivity_change_pct
#   earlier in this notebook, so keeping them leaks the target into X (and they
#   do not exist in the test set);
# - company_name is the row identifier used for the submission file, so
#   one-hot encoding it only adds one noise column per row.
excluded_cols = ["ROI_Proxy", "Employee_Value_Add", "company_name"]

# Rows with a missing target are dropped rather than filled with 0: a filled
# value is a fabricated label that biases both the fit and the R² estimate.
# Rows missing either target are removed so X, y1 and y2 stay row-aligned.
model_df = train.dropna(subset=target_cols)
X = model_df.drop(columns=target_cols).drop(columns=excluded_cols, errors="ignore")
y1 = model_df["employees_impacted"]
y2 = model_df["productivity_change_pct"]

# Identify numeric & categorical columns ("number" covers every integer and
# float width, not only int64/float64)
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

# Pipelines for numeric & categorical data with imputation
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": categories unseen during fit encode as all zeros
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Preprocessors (one per target; both apply the same transformations)
preprocessor_y1 = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor_y2 = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(alpha=0.01, max_iter=10000),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

# Function to evaluate models
def evaluate_models(X, y, preprocessor, target_name):
    """Cross-validate every estimator in the module-level ``models`` dict.

    Each estimator is chained after ``preprocessor`` in a pipeline and scored
    with ``cross_val_score`` using ``cv=5`` and R² as the metric.

    Parameters:
        X: feature table passed to the pipeline.
        y: target values for ``X``.
        preprocessor: transformer used as the first pipeline step.
        target_name: label written to the "Target" column of the result.

    Returns:
        A DataFrame with one row per model and the columns "Model", "Target",
        "Mean R²" and "Std R²".
    """
    def _score(name, model):
        # The preprocessor sits inside the pipeline that is cross-validated.
        pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
        cv_scores = cross_val_score(pipe, X, y, cv=5, scoring="r2")
        return {
            "Model": name,
            "Target": target_name,
            "Mean R²": cv_scores.mean(),
            "Std R²": cv_scores.std(),
        }

    return pd.DataFrame([_score(name, model) for name, model in models.items()])

# Score all candidate models once per target.
evaluation_runs = [
    (y1, preprocessor_y1, "employees_impacted"),
    (y2, preprocessor_y2, "productivity_change_pct"),
]
results_y1, results_y2 = (
    evaluate_models(X, target, prep, label)
    for target, prep, label in evaluation_runs
)

# Stack both result tables into one and display it as the cell output.
results_all = pd.concat([results_y1, results_y2], ignore_index=True)
results_all


## Task 5

#### Model Evaluation

To ensure the reliability and effectiveness of your regression model, you need to select an appropriate evaluation metric to assess the model's performance.

The Coefficient of Determination (R²) is a popular metric for regression models. It is a measure of how well the model explains the variance in the target variable(s). Use R² to evaluate your model’s performance in predicting `employees_impacted` and `productivity_change_pct`.

Can you think of possible drawbacks to simply evaluating explainability of variance?
Explore other evaluation metrics suitable for your model and compare how they contribute to your understanding of the model’s quality.

(1 point)

In [None]:
# Your code here, along with reasoning


In [None]:
# Start the submission table with the company identifiers from the test set.
submission = pd.DataFrame({'company_name': test['company_name']})

In [None]:
# This is just an example to illustrate how the submission works
# Do not include this line of code in your actual submissions
# Reads the sample submission file from the Kaggle input directory.
sample_submission=pd.read_csv("/kaggle/input/orange_1/sample.csv")

In [None]:
# Copy both target columns from the sample submission into the submission
# table (values are matched to rows by index).
for target in ('productivity_change_pct', 'employees_impacted'):
    submission[target] = sample_submission[target]

In [None]:
# Preview the first rows of the submission table (shown as the cell output).
submission.head()

In [None]:
# Write the submission to submission.csv in the working directory;
# index=False keeps the DataFrame index out of the file.
submission.to_csv('submission.csv', index = False)

## Congratulations ! 🎉

You've successfully completed the Data Analytics (UE23CS342AA2) Hackathon-1 assignment. This is a very significant milestone in your journey of Data Analytics.

By completing this assignment, you have:

- Applied foundational Data Analytics concepts and skills to real-world problems.
- Built and experimented with various regression models.

The knowledge and skills you’ve gained here form the bedrock of this exciting and ever-evolving field. Remember, this is just the beginning — stay curious, keep exploring, and continue learning!

Wishing you the very best for your upcoming assignments and ISA-1.