Standardization


In [1]:
import pandas as pd
import numpy as np
import random 
from scipy import stats
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
actual = pd.DataFrame()
actual["value"] = np.random.randint(10,50,10)
actual["predicted"] = np.random.randint(10,50,10)

In [3]:
x_min = actual["value"].min()
mean = actual["value"].mean()
standard = actual["value"].std()
normalization_denominator = (actual["value"].max()) - x_min
count_df = len(actual["value"])

In [4]:
def standardization_function(x):
  return (x - mean)/standard

In [5]:
actual["standardization"] = actual["value"].apply(standardization_function)

Normalization

In [6]:
def normalization_function(x):
  return (x - x_min)/normalization_denominator

In [7]:
actual["normalization"] = actual["value"].apply(normalization_function)

In [8]:
print(actual.head(10))

   value  predicted  standardization  normalization
0     36         45         0.571593       0.617647
1     24         34        -0.590969       0.264706
2     17         24        -1.269129       0.058824
3     33         11         0.280952       0.529412
4     37         47         0.668473       0.647059
5     15         12        -1.462890       0.000000
6     27         15        -0.300328       0.352941
7     49         31         1.831034       1.000000
8     37         45         0.668473       0.647059
9     26         15        -0.397208       0.323529


Variance

In [9]:
def variance(x):
    return ((x - mean)**2)/count_df

In [10]:
actual["variance"] = actual["value"].apply(variance)

Standard Deviation

In [11]:
def standard_deviation(x):
    return (((x - mean)**2)/count_df)**1/2

In [12]:
actual["standard_deviation"] = actual["value"].apply(standard_deviation)

In [13]:
print(actual.head(10))

   value  predicted  standardization  normalization  variance  \
0     36         45         0.571593       0.617647     3.481   
1     24         34        -0.590969       0.264706     3.721   
2     17         24        -1.269129       0.058824    17.161   
3     33         11         0.280952       0.529412     0.841   
4     37         47         0.668473       0.647059     4.761   
5     15         12        -1.462890       0.000000    22.801   
6     27         15        -0.300328       0.352941     0.961   
7     49         31         1.831034       1.000000    35.721   
8     37         45         0.668473       0.647059     4.761   
9     26         15        -0.397208       0.323529     1.681   

   standard_deviation  
0              1.7405  
1              1.8605  
2              8.5805  
3              0.4205  
4              2.3805  
5             11.4005  
6              0.4805  
7             17.8605  
8              2.3805  
9              0.8405  


Mean Absolute Error

In [14]:
def mean_absolute_error(x):
    return abs(x["value"] - x["predicted"]).mean()

In [15]:
actual["mean_absolute_error"] = actual.apply(mean_absolute_error,axis = 1)

In [16]:
print(actual.head(10))

   value  predicted  standardization  normalization  variance  \
0     36         45         0.571593       0.617647     3.481   
1     24         34        -0.590969       0.264706     3.721   
2     17         24        -1.269129       0.058824    17.161   
3     33         11         0.280952       0.529412     0.841   
4     37         47         0.668473       0.647059     4.761   
5     15         12        -1.462890       0.000000    22.801   
6     27         15        -0.300328       0.352941     0.961   
7     49         31         1.831034       1.000000    35.721   
8     37         45         0.668473       0.647059     4.761   
9     26         15        -0.397208       0.323529     1.681   

   standard_deviation  mean_absolute_error  
0              1.7405                  9.0  
1              1.8605                 10.0  
2              8.5805                  7.0  
3              0.4205                 22.0  
4              2.3805                 10.0  
5             

Random Sampling

In [2]:
salary = pd.DataFrame(np.random.randint(20000,80000,100))

In [18]:
def averageOfgroups(df,size):
    average_of_groups = []
    for i in range(0,size):
        average_of_groups.append(np.mean(df[i]))
    return np.mean(average_of_groups)

Without replacement

In [19]:
def without_replacement(data):
    groups = []
    for i in range(0, len(data), 15):
        groups.append(data[i:i + 15])
    return groups

In [20]:
group_without_replacement = without_replacement(salary[0])

In [21]:
print(averageOfgroups(group_without_replacement,7))

46785.438095238096


In [7]:
salary[0].sample(8)

25    22561
5     72618
87    46370
41    79089
56    65415
96    35246
0     36468
59    60511
Name: 0, dtype: int32

With Replacement

In [22]:
def with_replacement(data):
    groups = []
    for i in range(15):
        row = []
        for j in range(30):
            row.append(data[np.random.randint(0, len(data))])
        groups.append(row)
    return groups

In [23]:
group_with_replacement = with_replacement(salary[0])

In [24]:
print(averageOfgroups(group_with_replacement,15))

46782.58222222222


Built in function

In [6]:
salary[0].sample(30,replace=True)

84    44293
88    24387
46    61085
15    49031
60    51682
12    68397
95    26066
64    53742
64    53742
64    53742
75    38340
64    53742
28    76407
67    60101
77    45040
99    30192
1     42232
5     72618
46    61085
42    43394
47    40385
96    35246
15    49031
57    77024
40    26386
5     72618
6     31800
67    60101
89    59770
37    69074
Name: 0, dtype: int32

left tailed test
A factory produces wires with an average tensile strength of 500 N. A new supplier sends cheaper raw material. The engineer wants to check if the tensile strength has decreased.

H₀: μ = 500

Hₐ: μ < 500 (left-tailed test)

In [3]:
battery_df=pd.DataFrame(np.random.randint(0,10,30))

In [None]:
n=30
population_mean=30
std_population=3
alpha=0.05
sample_mean=battery_df[0].mean()

In [None]:
z_sample = (sample_mean - population_mean) / (std_population / (n**0.5))
z_critical = stats.norm.ppf(alpha)
if z_sample < z_critical:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")

A beverage company claims their soda cans contain 330 ml on average. A quality analyst wants to check if the filling machine deviates (could be more or less).

H₀: μ = 330

Hₐ: μ ≠ 330 (two-tailed test)

In [None]:
z_sample = (sample_mean - population_mean) / (std_population / (n**0.5))
z_critical_right = stats.norm.ppf(1 - alpha/2)
z_critical_left = stats.norm.ppf(alpha/2)
if z_critical_left > z_sample > z_critical_right:
        print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")

In [4]:
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

NameError: name 'mean_squared_error' is not defined

In [None]:
z_scores = stats.zscore(data)

outliers = data[abs(z_scores) > 3]

print(outliers)

In [None]:
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df)

print(df_standardized)

In [None]:
MinMaxScaler