# Setup

In [1]:
# Connect to drive.
# And get the user authentication as well!
from google.colab import drive
from google.colab import auth

auth.authenticate_user()

BASE_PATH = "/content/drive/"
drive.mount(BASE_PATH)


ModuleNotFoundError: No module named 'google'

In [None]:
# The data is usually stored in my school account.
# But sometimes I use my personal account to run the notebook.
# Unfortunately, we are no longer able to connect the Google drive of an account that differs from the account currently running the notebook.
# So, this will do for the moment.
email = !gcloud config get-value account
if email[0] == "tk240009@t.iput.ac.jp":
    DATASET_PATH = BASE_PATH + "My Drive/Year_2/Semester_2/Machine_Learning/Required/4/Data/"
else:
    DATASET_PATH = BASE_PATH + "My Drive/Shared/IPUT/Year_2/Semester_2/Machine_Learning/Required/4/Data/"


# 基本問題１ (Basic Problem 1)

In [None]:
# Base imports.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error


In [None]:
# Fetch data.
TRAIN_STUDY_DATASET_PATH = DATASET_PATH + "train_study.csv"
TEST_STUDY_DATASET_PATH = DATASET_PATH + "test_study.csv"
df0 = pd.read_csv(TRAIN_STUDY_DATASET_PATH)
df1 = pd.read_csv(TEST_STUDY_DATASET_PATH)


In [None]:
# Confirm data.
df0.head()


Unnamed: 0,Study Hours (時間),Test Score (点)
0,10.670298,90.527115
1,6.472322,64.646967
2,10.726844,97.988427
3,8.14816,79.864747
4,7.977288,80.129456


In [None]:
df1.head()


Unnamed: 0,Study Hours
0,5
1,9
2,7
3,8
4,7


In [None]:
# Define explanatory and response variables.
X = df0['Study Hours (時間)'].to_numpy().reshape(-1, 1)
y = df0['Test Score (点)'].to_numpy().reshape(-1, 1)
test_study = df1['Study Hours'].to_numpy().reshape(-1, 1)


In [None]:
# Split the dataset into training and testing variables.
# This time, we will have the test size 10% of the original.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.shape


(90, 1)

In [None]:
# Fit the model with training data.
reg = LinearRegression().fit(X_train, y_train)


In [None]:
# Here we can see the response values based on the explanatory values from `test_study.csv`.
demo = reg.predict(test_study)
print('Prediction values from "test_study.csv".')
print(demo)


Prediction values from "test_study.csv".
[[65.43741584]
 [85.01285076]
 [75.2251333 ]
 [80.11899203]
 [75.2251333 ]
 [89.90670949]
 [60.54355711]
 [94.80056822]
 [65.43741584]
 [85.01285076]]


In [None]:
# Get the model predictions from the test explanatory values.
y_pred = reg.predict(X_test)

# Evaluation based on root mean squared error (RMSE).
err = root_mean_squared_error(y_test, y_pred)
print("RMSE:", err)


RMSE: 3.9799065467171806


# 基本問題２ (Basic Problem 2)

In [None]:
# Fetch data.
TRAIN_BOSTON_DATASET_PATH = DATASET_PATH + "train_boston.csv"
TEST_BOSTON_DATASET_PATH = DATASET_PATH + "test_boston.csv"
df2 = pd.read_csv(TRAIN_BOSTON_DATASET_PATH).dropna()
df3 = pd.read_csv(TEST_BOSTON_DATASET_PATH).dropna()


In [None]:
# Check the insides.
print(df2)


       CRIME    ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD  TAX  \
0       high   0.0  18.10     0  0.718  3.561   87.9  1.6132  24.0  666   
1        low   0.0   8.14     0  0.538  5.950   82.0  3.9900   4.0  307   
2   very_low  82.5   2.03     0  0.415  6.162   38.4  6.2700   2.0  348   
3        low   0.0  21.89     0  0.624  6.151   97.9  1.6687   4.0  437   
4       high   0.0  18.10     0  0.614  6.980   67.6  2.5329  24.0  666   
..       ...   ...    ...   ...    ...    ...    ...     ...   ...  ...   
95      high   0.0  18.10     0  0.740  6.219  100.0  2.0048  24.0  666   
96      high   0.0  18.10     0  0.655  5.759   48.2  3.0665  24.0  666   
97      high   0.0  18.10     0  0.671  6.380   96.2  1.3861  24.0  666   
98       low   0.0   9.90     0  0.544  5.914   83.2  3.9986   4.0  304   
99      high   0.0  18.10     0  0.693  5.453  100.0  1.4896  24.0  666   

    PTRATIO       B  LSTAT  PRICE  
0      20.2  354.70   7.12   27.5  
1      21.0  232.60  27.71 

In [None]:
# Should not have the "PRICE" column.
print(df3)


      CRIME    ZN  INDUS  CHAS    NOX     RM   AGE   DIS  RAD  TAX  PTRATIO  \
0      high  14.7   18.0     0  0.623  6.352  55.2  4.23    5  330     16.8   
1       low  25.1    4.5     1  0.437  7.126  62.8  5.67    3  287     15.2   
2  very_low  92.0   11.9     0  0.589  5.989  73.4  8.15    6  432     18.7   
3       low   0.0    6.3     0  0.485  4.958  24.1  3.92    4  250     12.1   
4      high  18.3    2.7     1  0.711  6.671  88.5  7.65    2  621     20.1   
5       low  34.8    7.1     0  0.399  5.254  45.6  6.14    8  300     14.7   
6      high   0.0   21.3     1  0.742  4.563  62.1  5.13   24  566     19.8   
7  very_low  61.5   14.2     0  0.532  7.012  39.7  9.72    7  348     13.2   
8       low  11.8    9.6     1  0.412  5.485  54.9  2.88   10  210     17.1   
9      high  43.9    3.1     0  0.623  6.214  85.4  4.32   13  492     16.9   

        B  LSTAT  
0  293.14  11.72  
1  392.61   6.89  
2  381.45  10.23  
3  376.15  18.44  
4  395.23  22.67  
5  368.17   8.23

###### The following was written by referencing: [Multiple Linear Regression With scikit-learn](https://www.geeksforgeeks.org/machine-learning/multiple-linear-regression-with-scikit-learn/) (Accessed 20 October, 2025).

In [None]:
# Get the explanation and response variables.
X_boston = df2.drop("PRICE", axis=1)
y_boston = df2["PRICE"]


In [None]:
# "Encode" the `CRIME` column of the Boston tabular dataset.
mapping = {
    "very_low": 0,
    "low": 1,
    "high": 2,
    "very_high": 4
}
X_boston["CRIME"] = X_boston["CRIME"].map(mapping)
df3["CRIME"] = df3["CRIME"].map(mapping)


In [None]:
# Confirm the changes.
X_boston.head()


Unnamed: 0,CRIME,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,2,0.0,18.1,0,0.718,3.561,87.9,1.6132,24.0,666,20.2,354.7,7.12
1,1,0.0,8.14,0,0.538,5.95,82.0,3.99,4.0,307,21.0,232.6,27.71
2,0,82.5,2.03,0,0.415,6.162,38.4,6.27,2.0,348,14.7,393.77,7.43
3,1,0.0,21.89,0,0.624,6.151,97.9,1.6687,4.0,437,21.2,396.9,18.46
4,2,0.0,18.1,0,0.614,6.98,67.6,2.5329,24.0,666,20.2,374.68,11.66


In [None]:
# Confirm that the response only contains the target values.
y_boston.head()


Unnamed: 0,PRICE
0,27.5
1,13.2
2,24.1
3,17.8
4,29.8


In [None]:
# Split the dataset.
X_boston_train, X_boston_test, y_boston_train, y_boston_test = \
    train_test_split(X_boston, y_boston, test_size=0.2, random_state=42)


In [None]:
# Fit the regression model on the training data.
reg_boston = LinearRegression().fit(X_boston_train, y_boston_train)


In [None]:
# Get the prediction values from the test dataset.
test_boston_pred = reg_boston.predict(df3)
print('Prediction values from "test_boston.csv".')
print(test_boston_pred)


Prediction values from "test_boston.csv".
[25.52291179 28.81566441 13.15408444 25.02779407  0.76874821 27.32376524
  9.79340629 19.55687022 28.61229908 21.56030246]


In [None]:
# Make the prediction on test explanation dataset.
y_boston_pred = reg_boston.predict(X_boston_test)

# Evaluate using the root mean squeared error function.
eval = root_mean_squared_error(y_test, y_pred)

# Report loss function value.
print(f"RMSE: {eval}")


RMSE: 3.9799065467171806


# 応⽤問題 (Exercise Question)

In [None]:
# Base imports
import csv
import copy
import math
import random
from tabulate import tabulate


In [None]:
class MyDataFrame():
    """Poor man's `pandas.DataFrame` by yours truly."""

    def __init__(self, data):
        """`data` should be a type dict.
        And this assumes the key:value is like: dict[str, list[Any]]
        """
        self.data = data

    def __getitem__(self, key):
        """Returns separate instance only with value accessed by key of `data`."""
        return MyDataFrame({key: self.data[key]})

    def __delitem__(self, key):
        """Deletes the value accessed by key of `data`."""
        del self.data[key]

    def __len__(self):
        """Beware: This assumes all columns contain the same length of data.
        As such, it only counts the length of the first value.
        """
        return len(list(self.data.values())[0])

    def __add__(self, other):
        """Retuns a new combined instance of the given two MyDataFrames."""
        if self.data.keys() != other.data.keys():
            raise Exception("Cannot add differing headers together!")

        combined = self.copy()
        for key, value in other.data.items():
            combined.data[key].extend(value)

        return combined

    def extend(self, other):
        """Extends or concatenates and returns a new instance."""
        if len(self) != len(other):
            raise Exception("The lengths are not the same!")

        extended = self.copy()
        for key, value in other.data.items():
            if key in list(extended.data.keys()):
                raise Exception(f"Same key ({key}) detected! What to do?!?")

            extended.data[key] = value

        return extended

    def tabulate(self, tablefmt = "simple"):
        """Tabulates the data so it is not an eyesore to look at."""
        if not self.data:
            return

        print(tabulate(self.data, headers="keys", tablefmt=tablefmt))

    def copy(self):
        """Retuns a deep copy of self."""
        return copy.deepcopy(self)

    def drop(self, key):
        """Deletes the column accessed by key of `data`."""
        other = self.copy()
        del other[key]
        return other

    def dropna(self):
        """Deletes all columns that have a `None` in them."""
        other = self.copy()
        for key, value in self.data.items():
            if None in value:
                del other[key]
        else:
            return other

    def row(self, i, as_list = False):
        """Returns a MyDataFrame instance for the indexing row."""
        i_data = dict()
        for key, value in self.data.items():
            i_data[key] = [value[i]] if as_list else value[i]

        return MyDataFrame(i_data)

    def to_array(self, key = None):
        """Return the list value.
        If key is not specified the a random column will be selected.
        """
        if not key:
            key = list(self.data.keys())[0]

        return self.data[key]


In [None]:
# Define helper function.
def read_csv(filepath):
    """Reads the csv and returns a dict."""
    data = dict()
    with open(filepath) as f:
        reader = csv.DictReader(f)
        if reader.fieldnames:
            for fieldname in reader.fieldnames:
                data[fieldname] = []

        for row in reader:
            for key, value in row.items():
                try:
                    data[key].append(None if value == "" else float(value))
                except ValueError:
                    data[key].append(None if value == "" else value)

    return data


In [None]:
df4 = MyDataFrame(read_csv(TRAIN_STUDY_DATASET_PATH))
df5 = MyDataFrame(read_csv(TEST_STUDY_DATASET_PATH))


In [None]:
df4.tabulate()


  Study Hours (時間)    Test Score (点)
--------------------  -----------------
            10.6703             90.5271
             6.47232            64.647
            10.7268             97.9884
             8.14816            79.8647
             7.97729            80.1295
             3.16089            56.7477
            10.7627             94.8603
             1.0623             43.4393
             3.52982            62.4226
             5.34792            69.3558
             8.79383            81.4901
             2.97685            54.0269
             9.62993            83.4278
            10.834              95.5744
             2.63842            56.8833
             6.97334            78.1205
             1.08986            48.523
             4.86571            63.6957
             1.4416             55.1024
            10.5665             91.7967
             5.36147            66.9577
            10.4898             90.768
             8.86306            84.7216
      

In [None]:
# Chain method calls.
# First make a deep copy.
# Then, remove all columns with a `None` value in them.
# We should drop the columns we know we don't need as to
# not increase processing time for `dropna`.
# Although with this small size it probably doesn't matter...
X_study1 = df4.copy().drop("Test Score (点)").dropna()

# As with `pandas.DataFrame`, extracting a value with the key will give us
# a separate `MyDataFrame` instance that only includes said key:value pair.
y_study1 = df4["Test Score (点)"]


In [None]:
def my_train_test_split(*arrays, test_size = 0.2, seed = None):
    """For this case arrays will only take in types of `MyDataFrame`."""
    if seed:
        random.seed(seed)

    return_values = []
    for array in arrays:
        train = MyDataFrame({key: [] for key in array.data.keys()})
        test = MyDataFrame({key: [] for key in array.data.keys()})
        array_len = len(array)
        test_len = math.ceil(array_len * test_size)
        # Beware: Do not use `random.choices` here!
        # `random.choices` can repeat the same selections.
        test_indices = random.sample(range(array_len), k=test_len)
        for i in range(array_len):
            if i in test_indices:
                test += array.row(i, as_list=True)
            else:
                train += array.row(i, as_list=True)

        return_values.extend([train, test])

    return return_values


In [None]:
# Split the dataset using my own splitter.
X_study1_train, X_study1_test, y_study1_train, y_study1_test = my_train_test_split(X_study1, y_study1, seed=42)


In [None]:
class MyLinearRegression:
    """Poor man's simple linear regression (SLR) by your's truly."""

    def __init__(self):
        """Just initializes the coeffecients of the linear function."""
        self.a = None
        self.b = None

    def fit(self, X, y):
        """Fit the model according to the given parameters."""
        size = len(X)
        X_avg = sum(X) / size
        print("X avg:", X_avg)
        y_avg = sum(y) / len(y)
        print("y avg:", y_avg)

        X_deviations = [X[i] - X_avg for i in range(size)]
        covariance = sum([X_deviations[i] * (y[i] - y_avg) for i in range(size)])
        print("covariance:", covariance)
        variance = sum([X_deviations[i] ** 2 for i in range(size)])
        print("variance:", variance)

        self.a = covariance / variance
        print("a:", self.a)
        self.b = y_avg - self.a * X_avg
        print("b:", self.b)

    def predict(self, X):
        """Give a prediction given the explanatory value(s)."""
        size = len(X)
        if size == 0:
            raise Exception("No explanatory values!")
        elif size == 1:
            return self.a * X + self.b
        else:
            return [self.a * X[i] + self.b for i in range(size)]


In [None]:
# MAE is another loss function alongside RMSE that is used in machine learning.
# The previous problems were calculated with RMSE, lets use MAE in this one.

# As far as I am concerned:
# Penalize large errors?
# YES: RMSE
# NO: MAE

def my_mean_absolute_error(y_true, y_pred):
    """Your standard MSA."""

    y_true_len = len(y_true)
    if y_true_len != len(y_pred):
        raise Exception("They are not the same length!")

    return sum([
        abs(y_true[i] - y_pred[i])
        for i in range(y_true_len)
    ]) / y_true_len


In [None]:
my_reg = MyLinearRegression()
my_reg.fit(X_study1_train.to_array(), y_study1_train.to_array())
y_pred = my_reg.predict(X_study1_test.to_array())


X avg: 6.183099714939807
y avg: 72.06976227944794
covariance: 206.5506065572849
variance: 604.8885279311422
a: 0.3414688773545358
b: 69.9584261612163


In [None]:
# For debugging purposes...
with open("/content/output.csv", "w", newline="") as f:
    temp = X_study1_train.extend(y_study1_train)
    writer = csv.DictWriter(f, list(temp.data.keys()))
    writer.writeheader()
    writer.writerows([temp.row(i).data for i in range(len(temp))])


In [None]:
# Get the prediction values from the "test_study.csv" dataset.
test_study_pred = my_reg.predict(df5.to_array())
print('Prediction values from "test_study.csv".')
print(test_study_pred)


Prediction values from "test_study.csv".
[71.66577054798897, 73.03164605740712, 72.34870830269804, 72.69017718005259, 72.34870830269804, 73.37311493476166, 71.32430167063444, 73.71458381211619, 71.66577054798897, 73.03164605740712]


In [None]:
print("Mean Absolute Error:",
    my_mean_absolute_error(y_study1_test.to_array(), y_pred))


Mean Absolute Error: 14.619802315101012
