In [7]:
import os

import pandas as pd
import statsmodels.api as sm
from scipy.stats import f
from statsmodels.formula.api import ols

# 1. Sample

In [12]:
# SAMPLE
# Location of the sample CSV file. The default is the original local path;
# set the WILDFIRE_SAMPLE_CSV environment variable to point at the file when
# running the notebook on another machine.
file_path = os.environ.get(
    "WILDFIRE_SAMPLE_CSV",
    "C:\\Khalid\\Kuliah\\Statistika Industri\\Statistika-Industri-\\sample_wildfire_weather_data.csv",
)

# Read the CSV file
df = pd.read_csv(file_path)

# POPULATION alternative (disabled in this section): the full dataset is the
# Excel workbook wildfire_weather_data.xlsx, which is read with
# pd.read_excel(file_path, engine='openpyxl') instead of pd.read_csv.

# Preview the first rows (last expression, so the notebook displays it)
df.head()

Unnamed: 0,Region,Estimated_fire_area,Temperature_Mean,WindSpeed_Mean
0,WA,639.982,23.390953,3.551176
1,WA,593.569167,23.55262,4.0917
2,WA,37.345455,14.617797,4.637551
3,WA,544.986486,25.227217,4.334514
4,WA,35.8236,19.427573,4.840414


# Sample Temperature

In [13]:
# Keep only the grouping column (Region) and the temperature response.
temp_columns = ['Region', 'Temperature_Mean']
temp_df = df[temp_columns]
print(temp_df)

    Region  Temperature_Mean
0       WA         23.390953
1       WA         23.552620
2       WA         14.617797
3       WA         25.227217
4       WA         19.427573
..     ...               ...
387     TA          8.267706
388     TA         19.759433
389     TA         10.675490
390     TA          9.040038
391     TA         17.762332

[392 rows x 2 columns]


In [10]:
# Grand total of the temperature values and the number of observations;
# the printed label marks both as the subtracted term in SST and SSK.
total_value = temp_df['Temperature_Mean'].sum()
jumlah_data = temp_df['Region'].size
print(f"Total populasi value temperature keseluruhan (T2.. untuk pengurang di SST dan SSK): {total_value} | N (untuk pengurang di SST dan SSK): {jumlah_data }")

Total populasi value temperature keseluruhan (T2.. untuk pengurang di SST dan SSK): 8466.16692191152 | N (untuk pengurang di SST dan SSK): 392


In [16]:
# Spread the temperatures into one column per region. No index is given, so
# every original row is kept and holds a value only in its own region column.
temp_pivot = temp_df.pivot(values='Temperature_Mean', columns='Region')

# Replace the row labels with a fresh 0..n-1 range.
temp_pivot_reset = temp_pivot.reset_index(drop=True)

temp_pivot_reset

Region,NSW,NT,QL,SA,TA,VI,WA
0,,,,,,,23.390953
1,,,,,,,23.552620
2,,,,,,,14.617797
3,,,,,,,25.227217
4,,,,,,,19.427573
...,...,...,...,...,...,...,...
387,,,,,8.267706,,
388,,,,,19.759433,,
389,,,,,10.675490,,
390,,,,,9.040038,,


In [17]:
# Per-region totals (NaN cells are skipped by sum()).
column_sums = temp_pivot_reset.sum()

# Per-region number of observations (count() ignores NaN cells).
non_nan_counts = temp_pivot_reset.count()

print("Sum of each column (T2..untuk SSK):", column_sums, sep="\n")
print("\nCount of non-NaN values for each column (n untuk SSK):", non_nan_counts, sep="\n")

Sum of each column (T2..untuk SSK):
Region
NSW    1325.268599
NT     1870.717897
QL     1932.234205
SA      698.091247
TA      268.416643
VI      473.657843
WA     1897.780487
dtype: float64

Count of non-NaN values for each column (n untuk SSK):
Region
NSW    69
NT     75
QL     82
SA     29
TA     21
VI     32
WA     84
dtype: int64


In [18]:
# Fit Temperature_Mean against Region with OLS, then build the
# sequential (typ=1) ANOVA table from the fitted model.
temp_formula = 'Temperature_Mean ~ (Region)'
model = ols(temp_formula, data=temp_df).fit()
temp_anova_table = sm.stats.anova_lm(model, typ=1)

temp_anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Region,6.0,4921.443313,820.240552,28.580728,2.908602e-28
Residual,385.0,11049.145337,28.699079,,


In [19]:
# One-way ANOVA summary for Temperature_Mean grouped by Region.
# The sums of squares are read from the statsmodels ANOVA table; the degrees
# of freedom, mean squares and F statistic are recomputed by hand so that each
# step of the calculation can be shown. `f` is scipy.stats.f.
alfa = 0.05                              # significance level
k = len(temp_df['Region'].unique())      # number of groups (regions)
N = len(temp_df)                         # total number of observations
v1 = k - 1                               # degrees of freedom between groups
v2 = N - k                               # degrees of freedom within groups (error)
v_tot = v1 + v2

SSK = temp_anova_table.loc['Region', 'sum_sq']     # between-group sum of squares
SSE = temp_anova_table.loc['Residual', 'sum_sq']   # error sum of squares
SST = SSK + SSE
MSK = SSK / v1
MSE = SSE / v2
F_hitung = MSK / MSE
F_table = f.ppf(1 - alfa, v1, v2)        # critical value at level alfa
sum_value = temp_df['Temperature_Mean'].sum()

temp_data = {
    'SUMBER VARIANSI': ['Perlakuan', 'Error', 'TOTAL'],
    'SS': [SSK, SSE, SST],
    'v': [v1, v2, v_tot],
    'MS': [MSK, MSE, None],
    'Fhitung': [F_hitung, None, None]
}

# Create a DataFrame
temp_anova_final = pd.DataFrame(temp_data)

# Display the DataFrame and the values used in the decision
print(temp_anova_final)
print("Diketahui: ")
print(f"Total Sum / Sigma Xi = {sum_value}")
print(f"alfa: {alfa}")
print(f"F hitung = {MSK}/{MSE} = {F_hitung}")
print(f"F tabel = F({alfa};{v1};{v2}) = {F_table}")

# Reject H0 only when the statistic exceeds the critical value. The boundary
# case F_hitung == F_table previously printed no conclusion at all; it now
# falls into the "not rejected" branch.
if F_hitung > F_table:
    print("Maka h0 ditolak, artinya ada minimal 1 region yang berbeda nilainya")
elif F_hitung <= F_table:
    print("Maka h0 diterima, artinya semua region sama nilainya")

  SUMBER VARIANSI            SS    v          MS    Fhitung
0       Perlakuan   4921.443313    6  820.240552  28.580728
1           Error  11049.145337  385   28.699079        NaN
2           TOTAL  15970.588650  391         NaN        NaN
Diketahui: 
Total Sum / Sigma Xi = 8466.16692191152
alfa: 0.05
F hitung = 820.2405521582174/28.699078796584924 = 28.58072755477513
F tabel = F(0.05;6;385) = 2.1221371769660387
Maka h0 ditolak, artinya ada minimal 1 region yang berbeda nilainya


In [26]:
# Number of rows in the temperature sample.
N = temp_df.shape[0]
print(N)

392


# Sample WindSpeed


In [20]:
# Keep only the grouping column (Region) and the wind-speed response.
windspd_columns = ['Region', 'WindSpeed_Mean']
windspd_df = df[windspd_columns]
print(windspd_df)

    Region  WindSpeed_Mean
0       WA        3.551176
1       WA        4.091700
2       WA        4.637551
3       WA        4.334514
4       WA        4.840414
..     ...             ...
387     TA        1.748738
388     TA        3.482381
389     TA        2.851847
390     TA        3.716907
391     TA        2.908519

[392 rows x 2 columns]


In [21]:
# Grand total of the wind-speed values and the number of observations;
# the printed label marks both as the subtracted term in SST and SSK.
# The label previously said "temperature" (copied from the temperature
# section) even though the values are wind speeds.
total_value = windspd_df['WindSpeed_Mean'].sum()
jumlah_data = len(windspd_df['WindSpeed_Mean'])
print(f"Total populasi value wind speed keseluruhan (T2.. untuk pengurang di SST dan SSK): {total_value} | N (untuk pengurang di SST dan SSK): {jumlah_data}")

Total populasi value temperature keseluruhan (T2.. untuk pengurang di SST dan SSK): 1463.8010376058783 | N (untuk pengurang di SST dan SSK): 392


In [23]:
# Spread the wind speeds into one column per region. No index is given, so
# every original row is kept and holds a value only in its own region column.
windspd_pivot = windspd_df.pivot(values='WindSpeed_Mean', columns='Region')

# Replace the row labels with a fresh 0..n-1 range.
windspd_pivot_reset = windspd_pivot.reset_index(drop=True)

windspd_pivot_reset

Region,NSW,NT,QL,SA,TA,VI,WA
0,,,,,,,3.551176
1,,,,,,,4.091700
2,,,,,,,4.637551
3,,,,,,,4.334514
4,,,,,,,4.840414
...,...,...,...,...,...,...,...
387,,,,,1.748738,,
388,,,,,3.482381,,
389,,,,,2.851847,,
390,,,,,3.716907,,


In [28]:
# Per-region totals (NaN cells are skipped by sum()).
column_sums = windspd_pivot_reset.sum()

# Per-region number of observations (count() ignores NaN cells).
non_nan_counts = windspd_pivot_reset.count()

print("Sum of each column (T2..untuk SSK):", column_sums, sep="\n")
print("\nCount of non-NaN values for each column (n untuk SSK):", non_nan_counts, sep="\n")

Sum of each column (T2..untuk SSK):
Region
NSW    230.277063
NT     285.923303
QL     297.889751
SA     119.905377
TA      79.534034
VI      99.684994
WA     350.586516
dtype: float64

Count of non-NaN values for each column (n untuk SSK):
Region
NSW    69
NT     75
QL     82
SA     29
TA     21
VI     32
WA     84
dtype: int64


In [24]:
# Fit WindSpeed_Mean against Region with OLS, then build the
# sequential (typ=1) ANOVA table from the fitted model.
windspd_formula = 'WindSpeed_Mean ~ (Region)'
model = ols(windspd_formula, data=windspd_df).fit()
windspd_anova_table = sm.stats.anova_lm(model, typ=1)

windspd_anova_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Region,6.0,45.362306,7.560384,9.941582,3.284427e-10
Residual,385.0,292.785184,0.760481,,


In [31]:
# One-way ANOVA summary for WindSpeed_Mean grouped by Region.
# The sums of squares are read from the statsmodels ANOVA table; the degrees
# of freedom, mean squares and F statistic are recomputed by hand so that each
# step of the calculation can be shown. `f` is scipy.stats.f.
alfa = 0.05                                 # significance level
# Group count and sample size are taken from the wind-speed frame itself
# (they were previously read from temp_df, a copy-paste leftover).
k = len(windspd_df['Region'].unique())      # number of groups (regions)
N = len(windspd_df)                         # total number of observations
v1 = k - 1                                  # degrees of freedom between groups
v2 = N - k                                  # degrees of freedom within groups (error)
v_tot = v1 + v2

SSK = windspd_anova_table.loc['Region', 'sum_sq']     # between-group sum of squares
SSE = windspd_anova_table.loc['Residual', 'sum_sq']   # error sum of squares
SST = SSK + SSE
MSK = SSK / v1
MSE = SSE / v2
F_hitung = MSK / MSE
F_table = f.ppf(1 - alfa, v1, v2)           # critical value at level alfa

# Stored under wind-speed names so the temperature results
# (temp_data / temp_anova_final) are not overwritten by this cell.
windspd_data = {
    'SUMBER VARIANSI': ['Perlakuan', 'Error', 'TOTAL'],
    'SS': [SSK, SSE, SST],
    'v': [v1, v2, v_tot],
    'MS': [MSK, MSE, None],
    'Fhitung': [F_hitung, None, None]
}

# Create a DataFrame
windspd_anova_final = pd.DataFrame(windspd_data)

# Display the DataFrame and the values used in the decision
print(windspd_anova_final)
print("Diketahui: ")
print(f"alfa: {alfa}")
print(f"F hitung = {MSK}/{MSE} = {F_hitung}")
print(f"F tabel = F({alfa};{v1};{v2}) = {F_table}")

# Reject H0 only when the statistic exceeds the critical value. The boundary
# case F_hitung == F_table previously printed no conclusion at all; it now
# falls into the "not rejected" branch.
if F_hitung > F_table:
    print("Maka h0 ditolak, artinya ada minimal 1 region yang berbeda nilainya")
elif F_hitung <= F_table:
    print("Maka h0 diterima, artinya semua region sama nilainya")

  SUMBER VARIANSI          SS    v        MS   Fhitung
0       Perlakuan   45.362306    6  7.560384  9.941582
1           Error  292.785184  385  0.760481       NaN
2           TOTAL  338.147490  391       NaN       NaN
Diketahui: 
alfa: 0.05
F hitung = 7.5603843066085075/0.760480998629842 = 9.941582130559535
F tabel = F(0.05;6;385) = 2.1221371769660387
Maka h0 ditolak, artinya ada minimal 1 region yang berbeda nilainya


In [39]:
# Observations minus number of groups (N and k are set in an earlier cell).
print(N - k)

385
