In [10]:
import pandas as pd
import numpy as np
from scipy import stats

### Month 2 - exams


#### Question 1 - Pandas for Data Analysis

In [11]:
# a. Date Loading, Datetime Conversion and Feature extraction

# loading data from Github link
url = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/eletronic_sales.csv"

df = pd.read_csv(url)

# converting Date column to csv
df['Date'] = pd.to_datetime(df["Date"])

# creating new column for year, month, day and day_of the week
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df["Day_of_Week"] = df["Date"].dt.day_name()
df.head()

Unnamed: 0,Date,Branch,Sales Agent,Products,Units,Price,Year,Month,Day,Day_of_Week
0,2014-09-01,Woji,Chinedu,Apple,2,125.0,2014,9,1,Monday
1,2015-06-17,Woji,Emeka,Apple,5,125.0,2015,6,17,Wednesday
2,2015-09-10,Woji,Ibrahim,Lenovo,7,1.29,2015,9,10,Thursday
3,2015-11-17,Woji,Tolu,HP,11,4.99,2015,11,17,Tuesday
4,2015-10-31,Woji,Tonye,Lenovo,14,1.29,2015,10,31,Saturday


In [13]:
# b. Branch-level Total  Sales

# calculating for total sales
df["Total_Sales"] = df["Units"] * df["Price"]

# Returning a new data frame showing Branch and Total_sales
branch_sales = df.groupby("Branch")["Total_Sales"].sum().reset_index()
branch_sales


Unnamed: 0,Branch,Total_Sales
0,GRA,6002.09
1,Town,2486.72
2,Woji,11139.07


In [16]:
# c. Top Performing sales Agent

# Deteming the top sales agent
top_sales_agents = df.groupby('Sales Agent')['Total_Sales'].sum().reset_index()
top_sales_agents  = top_sales_agents.loc[top_sales_agents['Total_Sales'].idxmax()]
top_sales_agents


Sales Agent      Emeka
Total_Sales    3109.44
Name: 3, dtype: object

In [18]:
# d. Introduction and filling Missing Values

# Introducing missing values at specific row (5, 15, 25)
df.loc[[5, 15, 25], 'Price'] = np.nan
print("DataFrame with introduced missing values (rows 5, 15, 25):")

# Calculating the median of the 'Price' column
price_median = df['Price'].median()
print(f"\nMedian of the 'Price' column: {price_median}")

# Filling the missing values with the calculated median
df.fillna({'Price': price_median}, inplace=True)
print("\nDataFrame after filling missing values:\n")
df.iloc[[5, 15, 25]]

DataFrame with introduced missing values (rows 5, 15, 25):

Median of the 'Price' column: 4.99

DataFrame after filling missing values:



Unnamed: 0,Date,Branch,Sales Agent,Products,Units,Price,Year,Month,Day,Day_of_Week,Total_Sales
5,2014-02-26,Woji,Ibrahim,Compaq,27,4.99,2014,2,26,Wednesday,539.73
15,2015-04-10,Woji,Tonye,Lenovo,66,4.99,2015,4,10,Friday,131.34
25,2014-11-08,GRA,Chioma,Compaq,15,4.99,2014,11,8,Saturday,299.85


In [20]:
# e. Product-Level Summary

# Group by 'Product' and aggregate to find mean price and sum of units
summary_df = df.groupby('Products').agg(Average_Price=('Price', 'mean'), Total_Units_Sold=('Units', 'sum')
).reset_index()
summary_df

Unnamed: 0,Products,Average_Price,Total_Units_Sold
0,Apple,175.0,10
1,Compaq,5.19,278
2,Dell,11.912857,395
3,HP,11.524,722
4,Lenovo,3.005385,716


#### Question2 - Numpy for numeric computation

In [22]:
# a. Array Creation and Basic Manipulation

# Creating a NumPy array containing 20 random integers between 10 and 100.
random_array_number = np.random.randint(10, 101, size=20)

# Reshaping the array into a 4x5 matrix.
arr_reshaped = random_array_number.reshape(4, 5)

# Extracting the first two rows and last three columns from the reshaped array.
extract = arr_reshaped[:2, -3:]

# Compute the mean and standard deviation of the entire array.
mean_value = random_array_number.mean()
std_value = random_array_number.std()

print("Random array number:\n", random_array_number)
print("Reshaped array:\n", arr_reshaped)
print("Extrated array:\n", extract)

print("Mean:\n", mean_value)
print("Standard deviation:\n", std_value)

Random array number:
 [94 94 80 96 29 57 35 66 59 70 47 30 39 41 26 29 77 61 50 18]
Reshaped array:
 [[94 94 80 96 29]
 [57 35 66 59 70]
 [47 30 39 41 26]
 [29 77 61 50 18]]
Extrated array:
 [[80 96 29]
 [66 59 70]]
Mean:
 54.9
Standard deviation:
 23.834638658893066


In [None]:
# b. Operations on 2D Arrays

# Simulating a 2D array representing students' scores in 5 subjects (10 students).
student_scores = np.random.randint(40, 101, size=(10, 5))

# Calculating the average score per student.
avg_per_student = student_scores.mean(axis=1)

# Determining the highest and lowest score in the dataset.
highest_score = student_scores.max()
lowest_score = student_scores.min()

print("Scores:\n", student_scores)
print("\nAverage score per student:\n", avg_per_student)
print(f"\nHighest score: {highest_score}, Lowest score: {lowest_score}")

In [23]:
# c. Working with 3D Arrays

arr_3d = np.random.randint(1, 21, size=(3, 4, 2))

# Finding the sum of elements across the second axis (axis=1)
sum_axis1 = arr_3d.sum(axis=1)

# Compute the maximum value along each layer (axis=2)
max_per_layer = arr_3d.max(axis=2)

# Flatten the entire 3D array into a 1D array
flattened = arr_3d.flatten()
print("3D Array:\n", arr_3d)
print("\nSum across the second axis:\n", sum_axis1)
print("\nMaximum value along each layer:\n", max_per_layer)
print("\nFlattened array:\n", flattened)

3D Array:
 [[[14  7]
  [12 12]
  [ 3  6]
  [ 2 15]]

 [[ 8  5]
  [13 19]
  [ 8 11]
  [14 13]]

 [[18  4]
  [14 10]
  [ 3  7]
  [19  2]]]

Sum across the second axis:
 [[31 40]
 [43 48]
 [54 23]]

Maximum value along each layer:
 [[14 12  6 15]
 [ 8 19 11 14]
 [18 14  7 19]]

Flattened array:
 [14  7 12 12  3  6  2 15  8  5 13 19  8 11 14 13 18  4 14 10  3  7 19  2]


#### Question3 - Statistics for statistical analysis

**a. Measures of Center and Spread**

Given the dataset of $CO_2$ emissions (in metric tons per capita) from five countries: [25.4, 30.2, 22.5, 28.1, 35.0]

- **a Compute the mean, median, and mode**.

1.  *Mean (Average):*
    To find the mean, I'll sum all the values and divide by the number of values (which is 5).
    $$
    \text{Mean} (\bar{x}) = \frac{25.4 + 30.2 + 22.5 + 28.1 + 35.0}{5} = \frac{141.2}{5} = 28.24
    $$
2.  *Median (Middle Value):*
    First, I need to sort the data in ascending order.

    Sorted data: [22.5, 25.4, 28.1, 30.2, 35.0]

    The median  the middle one that is the 3rd value.
    $$
    \text{Median} = 28.1
    $$

3.  *Mode (The value that appear most in the dataset):*

    Every value appears only once.

    $$
    \text{Mode} = \text{No mode}
    $$

- **b Determine the range and standard deviation.**
1.  *Range:*
    The range is the difference between the maximum and minimum values in the dataset.
    * Maximum value = 35.0
    * Minimum value = 22.5
    $$
    \text{Range} = 35.0 - 22.5 = 12.5
    $$

2.  *Standard Deviation:*
    This measures how spread out the numbers are from the mean. $s = \sqrt{\frac{\sum(x_i - \bar{x})^2}{n-1}}$. where mean ($\bar{x}$) is 28.24.

    | $x_i$ | $x_i - \bar{x}$ | $(x_i - \bar{x})^2$ |
    | :---: | :---: | :---: |
    | 25.4  | -2.84 | 8.0656  |
    | 30.2  | 1.96  | 3.8416  |
    | 22.5  | -5.74 | 32.9476 |
    | 28.1  | -0.14 | 0.0196  |
    | 35.0  | 6.76  | 45.6976 |
    | *Total* | | *90.572* |
   
    $$
    s = \sqrt{\frac{90.572}{5-1}} = \sqrt{\frac{90.572}{4}} = \sqrt{22.643} \approx 4.758
    $$
    The standard deviation is approximately *4.76*.

- **c. Comment briefly on the spread of the data.**

The *range* of the data is *12.5*. This indicates a moderate spread in the CO2 emissions data. The data points are somewhat dispersed around the mean value of 28.24, but not extremely so.

The *standard deviation* is *4.76*. This indicate how data are spread from each other
***
**b. Hypothesis Testing**
We are given two samples of beef consumption (kg/person/year) and asked to perform a two-sample t-test at a 5% significance level ($\alpha=0.05$).
* *Argentina:* [60, 62, 58, 63, 59]
* *Bangladesh:* [15, 12, 18, 14, 16]
- **(a) State the null hypothesis ($H_0$) and the alternative hypothesis ($H_1$) clearly.**
$\mu_A$ is the true mean beef consumption for Argentina

$\mu_B$ is the true mean beef consumption for Bangladesh.
* *Null Hypothesis ($H_0$):* There is no significant difference in the mean beef consumption between Argentina and Bangladesh.
    $$ H_0: \mu_A = \mu_B $$
* *Alternative Hypothesis ($H_1$):* There is a significant difference in the mean beef consumption between Argentina and Bangladesh.
    $$ H_1: \mu_A \neq \mu_B $$
    
- **(b) Compute the t-statistic and the p-value using your notebook.**
To compute this, I would use a Python library like scipy.stats. The code would look something like this:

After running this code in the notebook, the following results are obtained:
t-statistic: ≈33.29
p-value: ≈1.39×10−8 (which is 0.0000000139)


- **(c) State your conclusion based on the p-value.**

Compare p-value to significance level (α): We need to check if our p-value is less than our significance level of 0.05. $$ 1.39 \times 10^{-8} < 0.05 $$ The p-value is indeed much smaller than the significance level.
Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis (H0​). This provides strong evidence that the observed difference in mean beef consumption between Argentina and Bangladesh is not due to random chance. Therefore, we can conclude that there is a statistically significant difference in beef consumption between the two countries.

#### Question4 - Linear Algebra

In [24]:
# a. Total Scores per Student
# Compute the total score for each student by summing the elements in each row. Present your result as a $4 \times 1$ column vector.
A = np.array([[80, 70, 90],
                [60, 85, 75],
                [95, 88, 92],
                [70, 60, 65]])
total_scores = A.sum(axis=1).reshape(-1, 1)
total_scores

array([[240],
       [220],
       [275],
       [195]])

In [25]:
# b. Average Score per Subject
# Compute the average score for each subject by calculating the mean of each column of matrix $A$. Present your result as a $1 \times 3$ row vector representing the averages for Math, English, and Science.
average_scores = A.mean(axis=0).reshape(1, -1)
average_scores


array([[76.25, 75.75, 80.5 ]])

In [26]:
# c. Weighted Final Grades
# The subjects have importance weights given by the vector $w = [0.5, 0.3, 0.2]$. Use matrix multiplication to compute each student's weighted final grade. The operation is $G = A w^T$. Show the resulting column vector $G$.
weight = np.array([0.5, 0.3, 0.2])
grade = A @ weight.reshape(-1, 1)
grade


array([[79. ],
       [70.5],
       [92.3],
       [66. ]])

In [27]:

# d. Applying Subject Importance

# (a) Multiply the Math column (first column) by 2 to create A'
A_prime = A.copy()
A_prime[:, 0] = 2 * A_prime[:, 0]

# (b) Recompute the total score for each student using A'
total_scores_prime = A_prime.sum(axis=1).reshape(-1, 1)
print("Original total scores per student:\n", total_scores)
print("\nNew total scores per student with Math weighted double:\n", total_scores_prime)

# (c) Brief discussion:
# The new totals are higher for all students because the Math scores now contribute twice as much.
# Students with higher Math scores benefit more from this weighting.


Original total scores per student:
 [[240]
 [220]
 [275]
 [195]]

New total scores per student with Math weighted double:
 [[320]
 [280]
 [370]
 [265]]
