In [9]:
#Problem 1:
import numpy as np
import pandas as pd

# Seed the legacy global RNG so the sampled ages are reproducible.
np.random.seed(42)

# Ten records: ages are drawn at random (upper bound 60 is exclusive),
# incomes are given explicitly with three deliberately missing (NaN) entries.
sampled_ages = np.random.randint(low=20, high=60, size=10)
reported_incomes = [
    20000, np.nan, 30000, 45000, np.nan,
    60000, 35000, 50000, np.nan, 70000,
]
data = {"Age": sampled_ages, "Income": reported_incomes}

df = pd.DataFrame(data)
print("Dataset:\n", df)



Dataset:
    Age   Income
0   58  20000.0
1   48      NaN
2   34  30000.0
3   27  45000.0
4   40      NaN
5   58  60000.0
6   38  35000.0
7   42  50000.0
8   30      NaN
9   30  70000.0


In [10]:
# (a) Mean of Income — missing values are skipped (skipna=True is the pandas
#     default; it is spelled out here to make the NaN handling explicit)
mean_income = df["Income"].mean(skipna=True)
print("\nMean Income:", mean_income)



Mean Income: 44285.71428571428


In [11]:
# (b) Median of Income — missing values are skipped (skipna=True is the pandas
#     default; it is spelled out here to make the NaN handling explicit)
median_income = df["Income"].median(skipna=True)
print("Median Income:", median_income)


Median Income: 45000.0


In [12]:
# (c) Age-Weighted Mean of Income
# weighted mean = sum(income * age) / sum(age), computed only over the rows
# whose Income is present (a NaN income would otherwise poison the average).
valid_data = df[df["Income"].notna()]
weighted_mean_income = np.average(
    valid_data["Income"],
    weights=valid_data["Age"],
)
print("Age-Weighted Mean Income:", weighted_mean_income)

Age-Weighted Mean Income: 43222.99651567944


In [16]:
#Problem 2:
import numpy as np
import pandas as pd

# Seed the legacy global RNG so the synthetic sample is reproducible.
np.random.seed(42)
n = 12
ages = np.random.randint(20, 70, n)
incomes = np.random.randint(20000, 120000, n).astype(float)

# Both columns are stored as float so that NaN can be inserted below without
# relying on an implicit int -> float upcast of the Age column (that silent
# upcast on assignment is deprecated in pandas 2.1+).
df = pd.DataFrame({
    "Age": ages.astype(float),
    "Income": incomes
})

# Insert some NaN values
df.loc[2, "Income"] = np.nan
df.loc[5, "Age"] = np.nan
df.loc[9, "Income"] = np.nan

print("Original Dataset:\n", df, "\n")

# Mean and standard deviation of Income; pandas skips NaN by default and
# Series.std() uses the sample standard deviation (ddof=1).
mean_income = df["Income"].mean()
std_income = df["Income"].std()

# Z-score per row; rows with a missing Income get a NaN z-score.
df["Income_z"] = (df["Income"] - mean_income) / std_income

# Flag |z| > 3 as outliers. A NaN z-score compares as False, so rows with a
# missing Income are never flagged.
df["Outlier"] = df["Income_z"].abs() > 3
num_outliers = df["Outlier"].sum()

print("Dataset with Z-scores and Outlier flag:\n", df, "\n")
print("Number of outliers (|z| > 3):", num_outliers)


Original Dataset:
      Age   Income
0   58.0  64131.0
1   48.0  80263.0
2   34.0      NaN
3   62.0  61090.0
4   27.0  87221.0
5    NaN  84820.0
6   58.0  20769.0
7   38.0  79735.0
8   42.0  82955.0
9   30.0      NaN
10  30.0  87969.0
11  43.0  25311.0 

Dataset with Z-scores and Outlier flag:
      Age   Income  Income_z  Outlier
0   58.0  64131.0 -0.131229    False
1   48.0  80263.0  0.511179    False
2   34.0      NaN       NaN    False
3   62.0  61090.0 -0.252328    False
4   27.0  87221.0  0.788260    False
5    NaN  84820.0  0.692647    False
6   58.0  20769.0 -1.857989    False
7   38.0  79735.0  0.490153    False
8   42.0  82955.0  0.618379    False
9   30.0      NaN       NaN    False
10  30.0  87969.0  0.818047    False
11  43.0  25311.0 -1.677118    False 

Number of outliers (|z| > 3): 0


In [17]:
#Problem 3:
import numpy as np
import pandas as pd

# Seed the legacy global RNG so the synthetic sample is reproducible.
np.random.seed(42)

# Ages are drawn first, then incomes — the order of the two RNG calls
# determines the values, so it must stay Age -> Income.
age_sample = np.random.randint(20, 60, size=12)
income_sample = np.random.randint(30000, 120000, size=12).astype(float)
data = {"Age": age_sample, "Income": income_sample}

df = pd.DataFrame(data)

# Blank out two incomes to simulate missing data
df.loc[[2, 5], "Income"] = np.nan

print("Synthetic Dataset:")
print(df)

Synthetic Dataset:
    Age    Income
0    58   90263.0
1    48   46023.0
2    34       NaN
3    27   97221.0
4    40   94820.0
5    58       NaN
6    38   89735.0
7    42   92955.0
8    30   94925.0
9    30   97969.0
10   43   35311.0
11   55  113104.0


In [18]:
# Bin edges and the label for each half-open interval [left, right)
bins = [18, 25, 35, 45, 60]
labels = ["[18–25)", "[25–35)", "[35–45)", "[45–60)"]

# Assign every age to its interval; right=False makes the left edge inclusive
# and the right edge exclusive, matching the labels above.
age_bin = pd.cut(
    df["Age"],
    bins=bins,
    labels=labels,
    right=False,
)
df["AgeBin"] = age_bin

print("\nDataset with Age Bins:")
print(df)


Dataset with Age Bins:
    Age    Income   AgeBin
0    58   90263.0  [45–60)
1    48   46023.0  [45–60)
2    34       NaN  [25–35)
3    27   97221.0  [25–35)
4    40   94820.0  [35–45)
5    58       NaN  [45–60)
6    38   89735.0  [35–45)
7    42   92955.0  [35–45)
8    30   94925.0  [25–35)
9    30   97969.0  [25–35)
10   43   35311.0  [35–45)
11   55  113104.0  [45–60)


In [19]:
# Group Income by age bin and summarise it (count / mean / median).
# AgeBin is categorical, so observed=False is passed explicitly: it keeps bins
# that contain no rows (e.g. [18–25)) in the result and avoids depending on
# the library default for `observed`, which pandas has deprecated and changes
# to True in a later release.
# "count" counts only non-missing incomes; mean and median likewise skip NaN.
result = (
    df.groupby("AgeBin", observed=False)["Income"]
      .agg(count="count", mean="mean", median="median")
      .reset_index()
      .sort_values("AgeBin")
)

print("\nResult (Tidy DataFrame):")
print(result)



Result (Tidy DataFrame):
    AgeBin  count      mean   median
0  [18–25)      0       NaN      NaN
1  [25–35)      3  96705.00  97221.0
2  [35–45)      4  78205.25  91345.0
3  [45–60)      3  83130.00  90263.0


In [20]:
#Problem 4:
#Create Array
import numpy as np

# Build a 3x3 (2D, not 1D) array holding 1..9 in row-major order
arr = np.arange(1, 10).reshape(3, 3)

print("Array:\n", arr)


Array:
 [[1 2 3]
 [4 5 6]
 [7 8 9]]


In [21]:
#Shape and Resize
# Shape → (rows, cols) tuple
print("Shape:", np.shape(arr))

# Size → total number of elements
print("Size:", np.size(arr))

# Transpose → rows and columns swapped
print("Transpose:\n", arr.transpose())

# Flatten → 1D copy of the elements in row-major order
print("Flatten:", arr.flatten())

Shape: (3, 3)
Size: 9
Transpose:
 [[1 4 7]
 [2 5 8]
 [3 6 9]]
Flatten: [1 2 3 4 5 6 7 8 9]


In [33]:
#Negative Indexing & Out-of-Bounds Indexing Error
# Negative indexing → -1 selects the last row
print("Last row using -1 indexing:", arr[-1])

# Error example: an integer index past the end of an axis.
# Note: a real slice such as arr[:, 10:] does not raise — it simply returns an
# empty result. It is the integer index 10 on axis 1 that raises IndexError.
try:
    print(arr[:, 10])   # column 10 does not exist
except IndexError as e:  # catch only the expected error, not every Exception
    print("Error in slicing:", e)
    

Last row using -1 indexing: [7 8 9]
Error in slicing: index 10 is out of bounds for axis 1 with size 3


In [34]:
#Arithmetic Operations
# Broadcasting: the scalar 10 is added to every element of arr
print("Broadcasting (+10):\n", arr + 10)

# Matrix product (rows of arr times columns of b), written with the @ operator
b = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 1]])
print("Dot Product:\n", arr @ b)


Broadcasting (+10):
 [[11 12 13]
 [14 15 16]
 [17 18 19]]
Dot Product:
 [[ 4  2  4]
 [10  5 10]
 [16  8 16]]


In [32]:
#Linear Algebra (Determinant & Inverse)
# A 2x2 matrix with determinant 2*3 - 1*5 = 1, hence invertible
mat = np.array([[2, 1], [5, 3]])

# Compute both quantities first, then report them.
# The determinant is computed in floating point, so the printed value can
# differ from the exact 1 by a rounding error.
det = np.linalg.det(mat)
inv = np.linalg.inv(mat)

print("Determinant:", det)
print("Inverse:\n", inv)


Determinant: 1.0000000000000002
Inverse:
 [[ 3. -1.]
 [-5.  2.]]
