In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Load the raw semicolon-delimited export; the path is relative to the
# notebook's working directory.
csv_path = "data.csv"
df = pd.read_csv(csv_path, sep=";")
df

In [3]:
# Reshape wide -> long. H03/H05/H16 stay as identifier columns; every other
# column is unpivoted so that its label lands in "Date" and its cell value
# in "sales".
id_columns = ["H03", "H05", "H16"]
df2 = pd.melt(df, id_vars=id_columns, var_name="Date", value_name="sales")

In [4]:
# Split labels such as "MO011980" into three string columns using the regex
# capture groups: literal "MO" prefix, 2-digit month, 4-digit year.
date_parts = df2["Date"].astype("str").str.extract(r'(MO)(\d{2})(\d{4})')
df2[["Prefix", "Month", "Year"]] = date_parts

In [5]:
# Inspect the long-format frame with the extracted Prefix/Month/Year columns.
df2

Unnamed: 0,H03,H05,H16,Date,sales,Prefix,Month,Year
0,FMP20000,"Total, gold included",Actual indices,MO011980,102.9,MO,01,1980
1,FMP20001,"Total, gold excluded",Actual indices,MO011980,55.1,MO,01,1980
2,FMP21000,Coal,Actual indices,MO011980,44.7,MO,01,1980
3,FMP23010,Iron ore,Actual indices,MO011980,37.8,MO,01,1980
4,FMP23020,Chromium,Actual indices,MO011980,24.7,MO,01,1980
...,...,...,...,...,...,...,...,...
3859,FMP23999,Other metallic minerals,Actual indices,MO122002,140.4,MO,12,2002
3860,FMP24000,Gold,Actual indices,MO122002,282.8,MO,12,2002
3861,FMP27000,Diamonds,Actual indices,MO122002,128.8,MO,12,2002
3862,FMP28888,Building materials,Actual indices,MO122002,59.8,MO,12,2002


In [6]:
# Assemble "YYYY-MM-01" strings from the extracted parts and parse them, so
# each observation is stamped with the first day of its month.
iso_dates = df2["Year"].astype(str) + "-" + df2["Month"].astype(str) + "-" + "01"
df2["date"] = pd.to_datetime(iso_dates, format="%Y-%m-%d")

In [7]:
# Inspect the frame after the parsed `date` column has been added.
df2

Unnamed: 0,H03,H05,H16,Date,sales,Prefix,Month,Year,date
0,FMP20000,"Total, gold included",Actual indices,MO011980,102.9,MO,01,1980,1980-01-01
1,FMP20001,"Total, gold excluded",Actual indices,MO011980,55.1,MO,01,1980,1980-01-01
2,FMP21000,Coal,Actual indices,MO011980,44.7,MO,01,1980,1980-01-01
3,FMP23010,Iron ore,Actual indices,MO011980,37.8,MO,01,1980,1980-01-01
4,FMP23020,Chromium,Actual indices,MO011980,24.7,MO,01,1980,1980-01-01
...,...,...,...,...,...,...,...,...,...
3859,FMP23999,Other metallic minerals,Actual indices,MO122002,140.4,MO,12,2002,2002-12-01
3860,FMP24000,Gold,Actual indices,MO122002,282.8,MO,12,2002,2002-12-01
3861,FMP27000,Diamonds,Actual indices,MO122002,128.8,MO,12,2002,2002-12-01
3862,FMP28888,Building materials,Actual indices,MO122002,59.8,MO,12,2002,2002-12-01


In [8]:
# Remove the raw label and the intermediate regex columns now that `date`
# carries the parsed timestamp.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" skips columns that are already gone, so re-running this
# cell is a no-op rather than a KeyError.
df2 = df2.drop(columns=["Date", "Prefix", "Month", "Year"], errors="ignore")
df2

Unnamed: 0,H03,H05,H16,sales,date
0,FMP20000,"Total, gold included",Actual indices,102.9,1980-01-01
1,FMP20001,"Total, gold excluded",Actual indices,55.1,1980-01-01
2,FMP21000,Coal,Actual indices,44.7,1980-01-01
3,FMP23010,Iron ore,Actual indices,37.8,1980-01-01
4,FMP23020,Chromium,Actual indices,24.7,1980-01-01
...,...,...,...,...,...
3859,FMP23999,Other metallic minerals,Actual indices,140.4,2002-12-01
3860,FMP24000,Gold,Actual indices,282.8,2002-12-01
3861,FMP27000,Diamonds,Actual indices,128.8,2002-12-01
3862,FMP28888,Building materials,Actual indices,59.8,2002-12-01


In [9]:
# One-hot encode every non-target column; only `sales` stays numeric.
# Each distinct `date` value becomes its own indicator column, so months are
# treated as unordered categories rather than as a time axis.
# NOTE(review): H03 (code) and H05 (name) look like two labels for the same
# series, and H16 appears to have a single level — confirm whether all
# three are needed as features.
categorical_columns = ['date', 'H03', 'H05', 'H16']
df3 = pd.get_dummies(data=df2, columns=categorical_columns)
df3

Unnamed: 0,sales,date_1980-01-01 00:00:00,date_1980-02-01 00:00:00,date_1980-03-01 00:00:00,date_1980-04-01 00:00:00,date_1980-05-01 00:00:00,date_1980-06-01 00:00:00,date_1980-07-01 00:00:00,date_1980-08-01 00:00:00,date_1980-09-01 00:00:00,...,H05_Gold,H05_Iron ore,H05_Manganese ore,H05_Nickel,H05_Other metallic minerals,H05_Other non-metallic minerals,H05_PGMs,"H05_Total, gold excluded","H05_Total, gold included",H16_Actual indices
0,102.9,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
1,55.1,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
2,44.7,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,37.8,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,24.7,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,140.4,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
3860,282.8,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,True
3861,128.8,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3862,59.8,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [10]:
# Target is the `sales` column; the feature matrix is every other
# (one-hot) column of df3.
y = df3['sales']
X = df3.drop(columns='sales')

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Expand the features with all polynomial terms up to `degree`
# (squares and pairwise products for degree 2).
# NOTE(review): the one-hot matrix is wide, so this expansion can be very
# large in memory — confirm it fits before re-running.
degree = 2
poly = PolynomialFeatures(degree=degree)
# Fit the expansion on the training split, then reuse it for the test split.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [13]:
# Fit an L2-regularised linear model on the polynomial features.
# `alpha` is the regularisation strength passed to Ridge.
# NOTE(review): the recorded output of this cell is a TypeError from
# `solve()` rejecting the `sym_pos` keyword — presumably an installed
# scikit-learn / SciPy version mismatch rather than a problem in this code;
# confirm by aligning the library versions.
alpha = 1.0
model = Ridge(alpha=alpha)
model.fit(X=X_train_poly, y=y_train)

  ret = a @ b


TypeError: solve() got an unexpected keyword argument 'sym_pos'

In [13]:
# Predict sales for the held-out rows from their polynomial features.
y_pred = model.predict(X=X_test_poly)

In [14]:
# Score the held-out predictions: squared error, absolute error and R².
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r_squared = r2_score(y_true=y_test, y_pred=y_pred)

In [15]:
# Report the test-set metrics, one per line, in MAE / MSE / R² order.
for label, value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("R-squared:", r_squared),
):
    print(label, value)

Mean Absolute Error (MAE): 20.382353654592492
Mean Squared Error (MSE): 806.6100995638947
R-squared: 0.9217881518752121


In [16]:
# Scatter of actual vs. predicted test values, with a dashed y = x reference
# line spanning the range of the actual values. The figure is written to
# disk and then closed, so nothing is rendered inline.
# NOTE(review): the title says "Linear Regression" although the fitted model
# is Ridge on polynomial features — confirm the intended wording.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.5)
lo, hi = min(y_test), max(y_test)
ax.plot([lo, hi], [lo, hi], color='red', linestyle='--', lw=2)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs. Predicted Values (Linear Regression)')
fig.savefig('actual_vs_predicted.png')
plt.close(fig)

In [17]:
# Persist the three test-set metrics to a plain-text report, one per line.
# Opening with mode 'w' overwrites any existing results.txt.
report_lines = [
    f"R2 Score: {r_squared}\n",
    f"Mean Absolute Error (MAE): {mae}\n",
    f"Mean Squared Error (MSE): {mse}\n",
]
with open('results.txt', 'w') as out:
    out.writelines(report_lines)

In [18]:
# `cv_scores` was printed here without being computed anywhere in the
# notebook, so a fresh "Restart & Run All" raised NameError. Compute it
# explicitly before printing.
# Five folds are used to match the five scores in the recorded output; the
# original call was not preserved — NOTE(review): confirm the estimator and
# data that were intended. Only the training split is cross-validated, so
# the held-out test rows stay untouched. cross_val_score clones `model`, so
# the already-fitted estimator is not modified.
cv_scores = cross_val_score(model, X_train_poly, y_train, cv=5, scoring="r2")
print("Cross-validation R2 Scores:", cv_scores)


Cross-validation R2 Scores: [0.92462638 0.913236   0.92580849 0.9231589  0.92478722]
