In [32]:
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path

In [33]:
# ---------- 1) NUMPY OPERATIONS ----------
print("\n[a] NumPy: Creating an array and performing elementwise operations\n")

# Five evenly spaced samples covering the closed interval [0, 10] (np.linspace).
arr0 = np.linspace(start=0, stop=10, num=5)
print("Initial array:", arr0)

# Scalar-times-array: the scalar is broadcast across every element.
arr1 = 2 * arr0
print("Array multiplied by 2:", arr1)

# Same vectorized doubling, written array-first; no Python-level loop involved.
doubled = arr0 * 2.0
print("Array multiplied by 2:", doubled)


[a] NumPy: Creating an array and performing elementwise operations

Initial array: [ 0.   2.5  5.   7.5 10. ]
Array multiplied by 2: [ 0.  5. 10. 15. 20.]
Array multiplied by 2: [ 0.  5. 10. 15. 20.]


In [34]:
print("\n[b] Compare loop vs vectorized execution on large array\n")
# Time an explicit Python-level loop against NumPy's vectorized power operator
# on the same large array.
N = 10_000_000
# Pin the dtype to 64-bit. Where NumPy's default integer is 32-bit (e.g. Windows
# with NumPy < 2.0), squaring values above 46_340 silently wraps around and both
# timings would be measured on wrong results.
big_array = np.arange(N, dtype=np.int64)

# Loop version: squares one NumPy scalar at a time in the interpreter.
t0 = time.perf_counter()
loop_squared = [x**2 for x in big_array]
t1 = time.perf_counter()

# Vectorized version: a single array expression, no Python-level loop.
t2 = time.perf_counter()
vec_squared = big_array ** 2
t3 = time.perf_counter()

# Cheap spot check on the largest element: both approaches must agree and
# must not have overflowed.
assert int(loop_squared[-1]) == int(vec_squared[-1]) == (N - 1) ** 2

print(f"Loop time (s):       {t1 - t0:.4f}")
print(f"Vectorized time (s): {t3 - t2:.4f}")


[b] Compare loop vs vectorized execution on large array

Loop time (s):       2.6072
Vectorized time (s): 0.0306


In [35]:
print("\n[2] Load CSV and inspect with .info() and .head()\n")
# Data directory: defaults to the original location, but can be redirected by
# setting the HOMEWORK3_DATA_DIR environment variable, so the notebook also runs
# on machines where this absolute path does not exist.
data_dir = Path(os.environ.get(
    "HOMEWORK3_DATA_DIR",
    "C:/Users/hinat/bootcamp_hina_tomar/homework/homework3/data",
))
# Kept as a forward-slash string so csv_path has the same type as before.
csv_path = (data_dir / "dataset.csv").as_posix()
df = pd.read_csv(csv_path)

# Strip whitespace and tabs from both ends of the column names right after
# loading; without this the headers came through with a stray "\t".
df.columns = df.columns.str.strip()

# .head(): by default the first 5 rows of the data
print("=== .head() ===")
print(df.head(), "\n")

# .info(): dtypes + non-null counts (prints to stdout)
print("=== .info() ===")
df.info()
print()  # blank line for readability


[2] Load CSV and inspect with .info() and .head()

=== .head() ===
     Name  Age   Salary
0    Hina   26  1500000
1   Mohit   26   600000
2  Sachin   25   750000
3   Rajat   25   550000
4  Shivam   21   900000 

=== .info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    10 non-null     object
 1   Age     10 non-null     int64 
 2   Salary  10 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 372.0+ bytes



In [39]:
print("\n[3] Summary statistics with .describe() and .groupby()\n")
print("\n=== Using .describe() ===")
# Descriptive statistics for the numeric columns; kept in `summary` for reuse.
summary = df.describe()
print(summary)

# Echo the real column labels so a header mismatch is easy to spot.
print("Column names:", df.columns.tolist())

# Mean salary per age.
# Defensive re-strip of string column labels: redundant when the labels were
# already stripped at load time, but it keeps this cell safe on its own.
stripped_labels = {
    label: label.strip() for label in df.columns if isinstance(label, str)
}
df_clean = df.rename(columns=stripped_labels)

if "Age" not in df_clean.columns:
    print("\nNo 'Age' column found for groupby example.")
    print("Available columns after stripping whitespace:", df_clean.columns.tolist())
else:
    # dropna=False keeps rows with a missing Age as their own group.
    group_summary = (
        df_clean
        .groupby("Age", dropna=False)
        .mean(numeric_only=True)
    )
    print("\n=== Using .groupby() on 'Age' ===")
    print(group_summary)
    


[3] Summary statistics with .describe() and .groupby()


=== Using .describe() ===
             Age        Salary
count  10.000000  1.000000e+01
mean   22.200000  9.490000e+05
std     3.119829  3.620144e+05
min    18.000000  5.500000e+05
25%    20.250000  7.125000e+05
50%    21.500000  8.400000e+05
75%    25.000000  1.162500e+06
max    26.000000  1.560000e+06
Column names: ['Name', 'Age', 'Salary']

=== Using .groupby() on 'Age' ===
        Salary
Age           
18    975000.0
20   1560000.0
21    890000.0
22    800000.0
25    650000.0
26   1050000.0


In [44]:
print("\n[4] Save summary statistics\n")
# Output directory: <data dir>/Processed. The data dir defaults to the original
# location but can be redirected with the HOMEWORK3_DATA_DIR environment
# variable. Every file below is written relative to output_dir, so the
# directory that is created is the one that is actually written to.
output_dir = Path(os.environ.get(
    "HOMEWORK3_DATA_DIR",
    "C:/Users/hinat/bootcamp_hina_tomar/homework/homework3/data",
)) / "Processed"
# Ensure the processed directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Keep index to show metric names like 'mean', 'std'
summary.to_csv(output_dir / "summary.csv", index=True)

# orient='table' stores the schema (including the grouping index) with the data.
group_summary.to_json(output_dir / "group_summary.json", orient="table", indent=2)

# Bonus: simple plot -- one histogram per numeric column, saved to disk.
df.hist(figsize=(4, 4))
plt.tight_layout()
plt.savefig(output_dir / "histogram.png")
plt.close()  # release the figure so it is not also rendered inline


[4] Save summary statistics

