<a href="https://colab.research.google.com/github/Imama-Kainat/ByteWise-MLDL/blob/main/fasrassignipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
!pip install python-docx
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from docx import Document



In [69]:

# Drive folder that holds the input .docx files. Each file's text is parsed
# with json.loads by the loading loop further down in this cell.
folder_path = '/content/drive/MyDrive/datu'

def read_docx(file_path):
    """Return the text of a .docx file as a single string.

    The document is opened with ``docx.Document`` and the text of every
    paragraph is joined with newline characters, in document order.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        str: The paragraph texts separated by ``'\\n'``.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# Load every .docx file in `folder_path` into one DataFrame (`all_data`).
#
# Each file is parsed into its own DataFrame and collected in `frames`; the
# frames are concatenated once at the end. Concatenating inside the loop would
# re-copy every previously loaded row on each iteration (quadratic work) and
# would also concatenate an empty seed frame, which recent pandas versions
# warn about.
frames = []

# sorted() makes the row order reproducible; os.listdir returns entries in
# arbitrary order.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.docx'):  # Process only .docx files
        continue

    file_path = os.path.join(folder_path, file_name)
    try:
        # Extract text from the .docx file
        docx_text = read_docx(file_path)

        # Parse the JSON data from the extracted text
        json_data = json.loads(docx_text)

        # Flatten nested JSON objects into dotted column names
        df = pd.json_normalize(json_data)
        frames.append(df)

    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {file_name}. Please ensure the data is properly formatted.")
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"An error occurred while processing file: {file_name}. Error: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file could be loaded.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()




In [70]:
# Display the concatenated DataFrame
print(all_data)

# Show DataFrame properties
print("\nDataFrame Info:")
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
all_data.info()

print("\nDataFrame Summary:")
print(all_data.describe(include='all'))

            type  status departure.iataCode departure.icaoCode  \
0      departure  active                lhe               opla   
1      departure  active                lhe               opla   
2      departure  active                lhe               opla   
3      departure  active                lhe               opla   
4      departure  active                lhe               opla   
...          ...     ...                ...                ...   
81387  departure  active                isb               opis   
81388  departure  active                isb               opis   
81389  departure  active                isb               opis   
81390  departure  active                isb               opis   
81391  departure  active                isb               opis   

      departure.terminal  departure.delay  departure.scheduledTime  \
0                      m             20.0  2023-12-16t06:35:00.000   
1                      m             20.0  2023-12-16t06:35:00.000 

In [71]:
# Show the dtype pandas assigned to each column of the loaded data.
all_data.dtypes

Unnamed: 0,0
type,object
status,object
departure.iataCode,object
departure.icaoCode,object
departure.terminal,object
departure.delay,float64
departure.scheduledTime,object
departure.estimatedTime,object
departure.actualTime,object
departure.estimatedRunway,object


In [72]:

# Count the missing values in each column.
print(all_data.isnull().sum())


type                                0
status                              0
departure.iataCode                  0
departure.icaoCode                  0
departure.terminal              34831
departure.delay                 28867
departure.scheduledTime             0
departure.estimatedTime         11024
departure.actualTime            33130
departure.estimatedRunway       33130
departure.actualRunway          33130
arrival.iataCode                    0
arrival.icaoCode                    0
arrival.scheduledTime               0
arrival.estimatedTime           34468
airline.name                        0
airline.iataCode                    0
airline.icaoCode                    0
flight.number                       0
flight.iataNumber                   0
flight.icaoNumber                   0
codeshared.airline.name         56312
codeshared.airline.iataCode     56312
codeshared.airline.icaoCode     56312
codeshared.flight.number        56312
codeshared.flight.iataNumber    56312
codeshared.f

In [73]:

# Keep only the rows that have a recorded 'departure.delay' and whose
# 'status' is 'active'.
has_delay = all_data['departure.delay'].notna()
is_active = all_data['status'] == 'active'
all_data = all_data[has_delay & is_active]

# Columns that still have more than 40000 missing values after the row filter
null_counts = all_data.isnull().sum()
cols_to_drop = null_counts[null_counts > 40000].index

# Remove those columns and renumber the rows from 0
all_data = all_data.drop(columns=cols_to_drop).reset_index(drop=True)

# Display the modified DataFrame
print(all_data)


            type  status departure.iataCode departure.icaoCode  \
0      departure  active                lhe               opla   
1      departure  active                lhe               opla   
2      departure  active                lhe               opla   
3      departure  active                lhe               opla   
4      departure  active                lhe               opla   
...          ...     ...                ...                ...   
52349  departure  active                isb               opis   
52350  departure  active                isb               opis   
52351  departure  active                isb               opis   
52352  departure  active                isb               opis   
52353  departure  active                isb               opis   

      departure.terminal  departure.delay  departure.scheduledTime  \
0                      m             20.0  2023-12-16t06:35:00.000   
1                      m             20.0  2023-12-16t06:35:00.000 

In [74]:
# Summarise the columns, non-null counts and dtypes left after filtering.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52354 entries, 0 to 52353
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   type                          52354 non-null  object 
 1   status                        52354 non-null  object 
 2   departure.iataCode            52354 non-null  object 
 3   departure.icaoCode            52354 non-null  object 
 4   departure.terminal            32293 non-null  object 
 5   departure.delay               52354 non-null  float64
 6   departure.scheduledTime       52354 non-null  object 
 7   departure.estimatedTime       50878 non-null  object 
 8   departure.actualTime          41485 non-null  object 
 9   departure.estimatedRunway     41485 non-null  object 
 10  departure.actualRunway        41485 non-null  object 
 11  arrival.iataCode              52354 non-null  object 
 12  arrival.icaoCode              52354 non-null  object 
 13  a

In [75]:

# Split 'departure.delay' into 8 equal-width intervals (pd.cut with an integer
# `bins`) and store the interval each row falls into as a new column.
delay_bins = pd.cut(all_data['departure.delay'], bins=8)
all_data['departure_delay_bin'] = delay_bins

# Display the modified DataFrame
print(all_data)


            type  status departure.iataCode departure.icaoCode  \
0      departure  active                lhe               opla   
1      departure  active                lhe               opla   
2      departure  active                lhe               opla   
3      departure  active                lhe               opla   
4      departure  active                lhe               opla   
...          ...     ...                ...                ...   
52349  departure  active                isb               opis   
52350  departure  active                isb               opis   
52351  departure  active                isb               opis   
52352  departure  active                isb               opis   
52353  departure  active                isb               opis   

      departure.terminal  departure.delay  departure.scheduledTime  \
0                      m             20.0  2023-12-16t06:35:00.000   
1                      m             20.0  2023-12-16t06:35:00.000 

In [76]:

# Parse the scheduled departure timestamp once and reuse the parsed Series.
scheduled = pd.to_datetime(all_data['departure.scheduledTime'])
all_data['departure.scheduledTime'] = scheduled

# Calendar parts of the scheduled departure
all_data['departure_year'] = scheduled.dt.year
all_data['departure_month'] = scheduled.dt.month
all_data['departure_day'] = scheduled.dt.day

# Time of day as an 'HHMMSS' string
all_data['scheduled_d_Time'] = scheduled.dt.strftime('%H%M%S')

# Display the timestamp next to the columns derived from it
preview_columns = [
    'departure.scheduledTime',
    'departure_year',
    'departure_month',
    'departure_day',
    'scheduled_d_Time',
]
print(all_data[preview_columns])


      departure.scheduledTime  departure_year  departure_month  departure_day  \
0         2023-12-16 06:35:00            2023               12             16   
1         2023-12-16 06:35:00            2023               12             16   
2         2023-12-16 09:00:00            2023               12             16   
3         2023-12-16 11:00:00            2023               12             16   
4         2023-12-16 11:25:00            2023               12             16   
...                       ...             ...              ...            ...   
52349     2024-06-01 03:35:00            2024                6              1   
52350     2024-06-01 03:35:00            2024                6              1   
52351     2024-06-01 04:15:00            2024                6              1   
52352     2024-06-01 04:30:00            2024                6              1   
52353     2024-06-01 04:30:00            2024                6              1   

      scheduled_d_Time  
0 

In [77]:

# Replace each of these timestamp columns with its time of day as an 'HHMMSS'
# string. The columns converted are the estimated departure, the *scheduled*
# arrival and the actual departure; no arrival.estimatedTime or
# arrival.actualTime column is converted here.
for time_col in ('departure.estimatedTime', 'arrival.scheduledTime', 'departure.actualTime'):
    parsed = pd.to_datetime(all_data[time_col])
    all_data[time_col] = parsed.dt.strftime('%H%M%S')


In [78]:


# Re-check columns and dtypes after the date/time columns were added and converted.
all_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52354 entries, 0 to 52353
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   type                          52354 non-null  object        
 1   status                        52354 non-null  object        
 2   departure.iataCode            52354 non-null  object        
 3   departure.icaoCode            52354 non-null  object        
 4   departure.terminal            32293 non-null  object        
 5   departure.delay               52354 non-null  float64       
 6   departure.scheduledTime       52354 non-null  datetime64[ns]
 7   departure.estimatedTime       50878 non-null  object        
 8   departure.actualTime          41485 non-null  object        
 9   departure.estimatedRunway     41485 non-null  object        
 10  departure.actualRunway        41485 non-null  object        
 11  arrival.iataCode            

In [79]:
# Preview the first 10 rows, including the newly derived columns.
all_data.head(10)

Unnamed: 0,type,status,departure.iataCode,departure.icaoCode,departure.terminal,departure.delay,departure.scheduledTime,departure.estimatedTime,departure.actualTime,departure.estimatedRunway,...,codeshared.flight.number,codeshared.flight.iataNumber,codeshared.flight.icaoNumber,arrival.baggage,arrival.terminal,departure_delay_bin,departure_year,departure_month,departure_day,scheduled_d_Time
0,departure,active,lhe,opla,m,20.0,2023-12-16 06:35:00,64500,65500.0,2023-12-16t06:55:00.000,...,715.0,tk715,thy715,,,"(-0.949, 244.625]",2023,12,16,63500
1,departure,active,lhe,opla,m,20.0,2023-12-16 06:35:00,64500,65500.0,2023-12-16t06:55:00.000,...,,,,,,"(-0.949, 244.625]",2023,12,16,63500
2,departure,active,lhe,opla,m,44.0,2023-12-16 09:00:00,93000,94300.0,2023-12-16t09:43:00.000,...,,,,,m,"(-0.949, 244.625]",2023,12,16,90000
3,departure,active,lhe,opla,m,360.0,2023-12-16 11:00:00,170000,,,...,,,,,m,"(244.625, 488.25]",2023,12,16,110000
4,departure,active,lhe,opla,m,60.0,2023-12-16 11:25:00,122500,,,...,,,,1.0,1,"(-0.949, 244.625]",2023,12,16,112500
5,departure,active,lhe,opla,main,30.0,2023-12-16 12:20:00,125000,,,...,625.0,ek625,uae625,3.0,3,"(-0.949, 244.625]",2023,12,16,122000
6,departure,active,lhe,opla,main,30.0,2023-12-16 12:20:00,125000,,,...,,,,3.0,3,"(-0.949, 244.625]",2023,12,16,122000
7,departure,active,lhe,opla,m,10.0,2023-12-16 13:00:00,130000,131000.0,2023-12-16t13:10:00.000,...,,,,5.0,1,"(-0.949, 244.625]",2023,12,16,130000
8,departure,active,lhe,opla,m,3.0,2023-12-16 15:00:00,150000,150300.0,2023-12-16t15:03:00.000,...,,,,,,"(-0.949, 244.625]",2023,12,16,150000
9,departure,active,lhe,opla,,2.0,2023-12-16 15:05:00,150500,150700.0,2023-12-16t15:07:00.000,...,,,,,,"(-0.949, 244.625]",2023,12,16,150500


In [80]:
# Summary statistics (count, mean, quartiles, ...) for the columns describe() covers by default.
all_data.describe()

Unnamed: 0,departure.delay,departure.scheduledTime,departure_year,departure_month,departure_day
count,52354.0,52354,52354.0,52354.0,52354.0
mean,33.804466,2024-01-11 19:29:15.777591296,2023.522367,6.599018,15.823184
min,1.0,2023-07-13 05:05:00,2023.0,1.0,1.0
25%,9.0,2023-10-11 16:31:15,2023.0,4.0,8.0
50%,16.0,2024-01-08 16:10:00,2024.0,7.0,16.0
75%,29.0,2024-04-14 20:58:45,2024.0,10.0,24.0
max,1950.0,2024-07-10 04:55:00,2024.0,12.0,31.0
std,73.390865,,0.499504,3.449401,8.865619


In [81]:

# Columns kept for modelling; 'departure_delay_bin' is the binned delay
# created earlier, the others describe the flight and its schedule.
# NOTE(review): 'departure.estimatedTime' is kept alongside the scheduled
# time; confirm it would be known at prediction time and does not reveal the delay.
selected_columns = [
    # origin airport and scheduled date/time
    'departure.iataCode', 'departure_year', 'departure_month', 'departure_day',
    'scheduled_d_Time', 'departure.estimatedTime',
    # destination airport and scheduled arrival time
    'arrival.iataCode', 'arrival.scheduledTime',
    # carrier and flight
    'airline.name', 'flight.number',
    # binned departure delay
    'departure_delay_bin',
]

# Keep only the selected columns in the DataFrame
all_data = all_data.loc[:, selected_columns]


In [82]:
# Confirm that only the selected columns remain and check their dtypes.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52354 entries, 0 to 52353
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   departure.iataCode       52354 non-null  object  
 1   departure_year           52354 non-null  int32   
 2   departure_month          52354 non-null  int32   
 3   departure_day            52354 non-null  int32   
 4   scheduled_d_Time         52354 non-null  object  
 5   departure.estimatedTime  50878 non-null  object  
 6   arrival.iataCode         52354 non-null  object  
 7   arrival.scheduledTime    52354 non-null  object  
 8   airline.name             52354 non-null  object  
 9   flight.number            52354 non-null  object  
 10  departure_delay_bin      52354 non-null  category
dtypes: category(1), int32(3), object(7)
memory usage: 3.4+ MB


In [83]:
# Preview the first 10 rows of the reduced DataFrame.
all_data.head(10)

Unnamed: 0,departure.iataCode,departure_year,departure_month,departure_day,scheduled_d_Time,departure.estimatedTime,arrival.iataCode,arrival.scheduledTime,airline.name,flight.number,departure_delay_bin
0,lhe,2023,12,16,63500,64500,ist,113500,pakistan international airlines,5715,"(-0.949, 244.625]"
1,lhe,2023,12,16,63500,64500,ist,113500,turkish airlines,715,"(-0.949, 244.625]"
2,lhe,2023,12,16,90000,93000,khi,110000,airblue,401,"(-0.949, 244.625]"
3,lhe,2023,12,16,110000,170000,khi,184500,pakistan international airlines,303,"(244.625, 488.25]"
4,lhe,2023,12,16,112500,122500,dxb,140000,serene air,723,"(-0.949, 244.625]"
5,lhe,2023,12,16,122000,125000,dxb,145000,air canada,7579,"(-0.949, 244.625]"
6,lhe,2023,12,16,122000,125000,dxb,145000,emirates,625,"(-0.949, 244.625]"
7,lhe,2023,12,16,130000,130000,dxb,155500,airblue,410,"(-0.949, 244.625]"
8,lhe,2023,12,16,150000,150000,mct,171500,oman air,344,"(-0.949, 244.625]"
9,lhe,2023,12,16,150500,150500,khi,165500,flyjinnah,843,"(-0.949, 244.625]"


In [84]:
# Remove rows that are exact duplicates across all remaining columns.
all_data=all_data.drop_duplicates()


In [85]:
class MultiClassSVM:
    """One-vs-rest multi-class classifier built from binary ``MultiKernelSVM`` models.

    ``fit`` trains one binary model per distinct label (that label as +1, all
    other labels as -1). ``predict`` scores every sample with every binary
    model and returns the label whose model gave the highest score.

    Attributes:
        C, kernel, degree, gamma: Hyper-parameters forwarded unchanged to each
            ``MultiKernelSVM``.
        svm_models: The fitted binary models, one per entry of ``classes_``.
        classes_: Sorted array of the distinct labels seen by the last ``fit``.
    """

    def __init__(self, C=1.0, kernel='linear', degree=3, gamma='scale'):
        self.C = C
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.svm_models = []
        self.classes_ = np.array([])

    def fit(self, X, y):
        """Train one binary model per distinct label in ``y``.

        Args:
            X: Feature matrix passed unchanged to each binary model.
            y: Array-like of class labels, one per row of ``X``.
        """
        y = np.asarray(y)
        self.classes_ = np.unique(y)

        # Start from a clean slate. Without this, calling fit() again (for
        # example once per cross-validation fold) would keep the models of
        # earlier calls, and predict() would take the argmax over all of them.
        self.svm_models = []

        for cls in self.classes_:
            # Create a binary label for current class vs all others
            binary_y = np.where(y == cls, 1, -1)
            svm = MultiKernelSVM(C=self.C, kernel=self.kernel, degree=self.degree, gamma=self.gamma)
            svm.fit(X, binary_y)
            self.svm_models.append(svm)

    def predict(self, X):
        """Return the predicted class label for each row of ``X``.

        Each binary model contributes one score per sample. The continuous
        ``decision_function`` is used when the model provides one; otherwise
        the +1/-1 output of ``predict`` is used, in which case ties go to the
        first class in ``classes_``.

        Returns:
            numpy.ndarray: Labels taken from ``classes_`` (not positional
            indices), one per row of ``X``.
        """
        scores = []
        for svm in self.svm_models:
            score_fn = getattr(svm, 'decision_function', svm.predict)
            scores.append(score_fn(X))
        scores = np.array(scores)

        # argmax gives the position of the winning binary model; map it back
        # to the label that model was trained to recognise.
        return self.classes_[np.argmax(scores, axis=0)]


In [86]:
class MultiKernelSVM:
    """Binary kernel classifier for labels in {-1, +1}.

    Supports 'linear', 'polynomial' and 'rbf' kernels. Training uses the simple
    iterative update in ``fit`` (it is not a QP/SMO solver): whenever a
    training sample has margin below 1, its coefficient and the bias are
    increased.

    Attributes:
        C: Step-size factor used by the update rule.
        kernel: 'linear', 'polynomial' or 'rbf'.
        degree: Exponent of the polynomial kernel.
        gamma: RBF width. A number is used as is; 'scale' resolves to
            ``1 / n_features`` and 'auto' to ``1 / n_rows`` of the first matrix
            passed to the kernel after ``fit`` starts (the training matrix).
        alphas: Per-training-sample coefficients (set by ``fit``).
        b: Bias term (set by ``fit``).
        support_vectors: The full training matrix (every sample is kept).
        y_support: The training labels.
    """

    def __init__(self, C=1.0, kernel='linear', degree=3, gamma='scale'):
        self.C = C
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.alphas = None
        self.b = None
        self.support_vectors = None
        self.y_support = None
        # Numeric gamma resolved from 'scale'/'auto'; kept separate from
        # self.gamma so the hyper-parameter itself is never overwritten.
        self._gamma_value = None

    def _rbf_gamma(self, X):
        """Return the numeric RBF gamma, resolving 'scale'/'auto' on first use."""
        if self._gamma_value is None:
            if self.gamma == 'scale':
                self._gamma_value = 1.0 / X.shape[1]
            elif self.gamma == 'auto':
                self._gamma_value = 1.0 / X.shape[0]
            else:
                self._gamma_value = self.gamma
        return self._gamma_value

    def _kernel(self, X, Y):
        """Return the kernel matrix between the rows of ``X`` and the rows of ``Y``.

        Raises:
            ValueError: If ``self.kernel`` is not a supported kernel name.
        """
        if self.kernel == 'linear':
            return np.dot(X, Y.T)
        elif self.kernel == 'polynomial':
            return (np.dot(X, Y.T) + 1) ** self.degree
        elif self.kernel == 'rbf':
            gamma = self._rbf_gamma(X)
            # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, computed for all pairs
            sq_dists = np.sum(X**2, 1).reshape(-1, 1) + np.sum(Y**2, 1) - 2 * np.dot(X, Y.T)
            return np.exp(-gamma * sq_dists)
        else:
            raise ValueError("Unsupported kernel")

    def fit(self, X, y, num_iter=100):
        """Fit the classifier.

        Args:
            X: 2-D array of training samples (rows) by features (columns).
            y: 1-D array of labels, expected to be -1 or +1.
            num_iter: Number of passes over the training samples.

        Note:
            The full ``n_samples x n_samples`` kernel matrix is materialised,
            so memory grows quadratically with the number of samples.
        """
        num_samples, num_features = X.shape

        # Re-resolve 'scale'/'auto' for this training set. Previously the
        # resolved number replaced self.gamma, so a later fit on different
        # data silently kept the gamma of the first fit.
        self._gamma_value = None
        K = self._kernel(X, X)

        self.alphas = np.zeros(num_samples)
        self.b = 0

        for _ in range(num_iter):
            for i in range(num_samples):
                # Margin of sample i under the current coefficients
                condition = y[i] * (np.dot(self.alphas * y, K[i]) + self.b)
                if condition < 1:
                    self.alphas[i] += self.C * (1 - condition)
                    self.b += self.C * y[i]

        self.support_vectors = X
        self.y_support = y

    def decision_function(self, X):
        """Return the signed score of each row of ``X`` (positive means class +1)."""
        K = self._kernel(X, self.support_vectors)
        return np.dot(K, self.alphas * self.y_support) + self.b

    def predict(self, X):
        """Return the sign of the decision score for each row of ``X``.

        The result is -1 or +1, or 0 when the score is exactly zero.
        """
        return np.sign(self.decision_function(X))


In [87]:
class DecisionTreeNode:
    """A node of the decision tree.

    Internal nodes carry ``feature``, ``threshold``, ``left`` and ``right``;
    leaves carry only ``value`` (the predicted class) and leave the rest None.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # column index tested at this node
        self.threshold = threshold  # rows with value <= threshold go left
        self.left = left
        self.right = right
        self.value = value          # class label for a leaf, None otherwise


class DecisionTree:
    """Classification tree using entropy-based information gain.

    Labels must be non-negative integers because class counts are taken with
    ``np.bincount``.
    """

    def __init__(self, max_depth=10):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Build the tree from feature matrix ``X`` and integer labels ``y``."""
        self.tree = self._fit(np.asarray(X), np.asarray(y), depth=0)

    @staticmethod
    def _leaf(y):
        """Return a leaf predicting the most frequent label in ``y``."""
        return DecisionTreeNode(value=np.bincount(y).argmax())

    def _fit(self, X, y, depth):
        """Recursively build the subtree for the samples ``X``/``y``."""
        # Stop when the node is pure or the depth budget is used up.
        if len(set(y)) == 1 or depth >= self.max_depth:
            return self._leaf(y)

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            # No threshold separates the samples (all feature rows identical).
            return self._leaf(y)

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = ~left_indices

        left_node = self._fit(X[left_indices], y[left_indices], depth + 1)
        right_node = self._fit(X[right_indices], y[right_indices], depth + 1)

        return DecisionTreeNode(feature=best_feature, threshold=best_threshold, left=left_node, right=right_node)

    def _find_best_split(self, X, y):
        """Return the ``(feature, threshold)`` pair with the highest information gain.

        Returns ``(None, None)`` when no feature has two distinct values.
        """
        best_feature, best_threshold = None, None
        best_gain = -1

        num_features = X.shape[1]
        for feature in range(num_features):
            # np.unique returns sorted values. The largest one is skipped:
            # using it as a threshold sends every sample left and leaves the
            # right child empty, which made the recursion fail on
            # np.bincount([]).argmax().
            thresholds = np.unique(X[:, feature])[:-1]
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction from splitting on ``feature <= threshold``."""
        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold

        left_y, right_y = y[left_indices], y[right_indices]
        p_left = len(left_y) / len(y)
        p_right = len(right_y) / len(y)

        entropy_before = self._entropy(y)
        entropy_after = (p_left * self._entropy(left_y)) + (p_right * self._entropy(right_y))

        return entropy_before - entropy_after

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        proportions = np.bincount(y) / len(y)
        return -np.sum(proportions * np.log2(proportions + 1e-10))  # Added epsilon to avoid log(0)

    def predict(self, X):
        """Return the predicted label for each row of ``X``."""
        return np.array([self._predict(x, self.tree) for x in X])

    def _predict(self, x, node):
        """Walk from ``node`` to a leaf for the single sample ``x``."""
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)


In [88]:

class LogisticRegression:
    """Multinomial (softmax) logistic regression trained by batch gradient steps.

    The model has one weight column per class and no separate intercept term.

    Attributes:
        learning_rate: Step size of each update.
        num_iterations: Number of full-batch updates performed by ``fit``.
        classes_: Sorted array of the distinct labels seen by ``fit``.
        class_mapping: Dict mapping each original label to its column index.
        num_classes: Number of distinct labels.
        theta: Weight matrix of shape ``(n_features, num_classes)``.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def softmax(self, z):
        """Return row-wise softmax of ``z``.

        The row maximum is subtracted first so ``np.exp`` cannot overflow.
        """
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X, y):
        """Fit the weights on feature matrix ``X`` and labels ``y``.

        Labels may be arbitrary values; they are mapped to consecutive
        integers internally and mapped back in ``predict``.
        """
        # return_inverse yields, for every sample, the index of its label in
        # the sorted unique labels, i.e. the consecutive-integer encoding.
        self.classes_, y_index = np.unique(np.asarray(y), return_inverse=True)
        y_index = y_index.reshape(-1)
        self.class_mapping = {c: i for i, c in enumerate(self.classes_)}

        self.num_classes = len(self.classes_)
        self.theta = np.zeros((X.shape[1], self.num_classes))

        # The one-hot targets do not change between iterations, so build them
        # once instead of on every pass.
        y_one_hot = np.eye(self.num_classes)[y_index]
        num_samples = len(y_index)

        for _ in range(self.num_iterations):
            predictions = self.softmax(np.dot(X, self.theta))
            error = y_one_hot - predictions
            gradient = np.dot(X.T, error) / num_samples
            # Ascent on the log-likelihood, hence "+=".
            self.theta += self.learning_rate * gradient

    def predict(self, X):
        """Return the most probable original label for each row of ``X``."""
        predictions = self.softmax(np.dot(X, self.theta))
        predicted_classes = np.argmax(predictions, axis=1)
        # Index into the sorted labels to undo the integer encoding in one
        # vectorised step (no per-sample dictionary search).
        return self.classes_[predicted_classes]

In [89]:
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() added a stray "None" line to the output.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52354 entries, 0 to 52353
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   departure.iataCode       52354 non-null  object  
 1   departure_year           52354 non-null  int32   
 2   departure_month          52354 non-null  int32   
 3   departure_day            52354 non-null  int32   
 4   scheduled_d_Time         52354 non-null  object  
 5   departure.estimatedTime  50878 non-null  object  
 6   arrival.iataCode         52354 non-null  object  
 7   arrival.scheduledTime    52354 non-null  object  
 8   airline.name             52354 non-null  object  
 9   flight.number            52354 non-null  object  
 10  departure_delay_bin      52354 non-null  category
dtypes: category(1), int32(3), object(7)
memory usage: 3.4+ MB
None


In [90]:
import pandas as pd
import numpy as np

def preprocess_data(all_data):
    """Turn the cleaned flight DataFrame into a feature matrix and target vector.

    The input frame is not modified: all work happens on a copy, so the cell
    calling this function can be re-run safely. (Mutating the input meant a
    second run saw already-converted numeric time columns and turned them
    into NaN.)

    Steps:
        1. Encode 'departure.iataCode', 'arrival.iataCode' and 'airline.name'
           as integer category codes.
        2. Convert the 'HHMMSS' time columns to minutes past midnight.
        3. Fill missing times with the column median.
        4. Encode 'departure_delay_bin' as integer category codes.

    Args:
        all_data: DataFrame containing the columns named above plus
            'flight.number' (which is dropped from the features).

    Returns:
        tuple: ``(X, y)`` where ``X`` is the 2-D feature array (every column
        except 'departure_delay_bin' and 'flight.number') and ``y`` is the
        1-D array of encoded delay bins.
    """
    data = all_data.copy()

    # Convert categorical columns to numeric codes
    categorical_columns = ['departure.iataCode', 'arrival.iataCode', 'airline.name']
    for col in categorical_columns:
        data[col] = data[col].astype('category').cat.codes

    # Convert time strings to minutes past midnight
    def convert_to_minutes(time_value):
        """Return minutes past midnight for an 'HHMMSS' value, or NaN if unparseable."""
        if isinstance(time_value, str):
            try:
                return int(time_value[:2]) * 60 + int(time_value[2:4])
            except ValueError:
                return np.nan
        # Accept NumPy integers too; bool is excluded because it is an int subclass.
        elif isinstance(time_value, (int, np.integer)) and not isinstance(time_value, bool):
            # Convert integer time format to zero-padded string format
            time_str = f"{int(time_value):06d}"
            return int(time_str[:2]) * 60 + int(time_str[2:4])
        return np.nan

    time_columns = ['scheduled_d_Time', 'departure.estimatedTime', 'arrival.scheduledTime']
    for col in time_columns:
        minutes = data[col].apply(convert_to_minutes)
        # Handle missing values: assign the filled Series back instead of
        # using fillna(inplace=True) on a column selection, which is chained
        # assignment and is not guaranteed to update the frame.
        data[col] = minutes.fillna(minutes.median())

    # Convert target variable to numerical
    data['departure_delay_bin'] = data['departure_delay_bin'].astype('category').cat.codes

    # Drop the target and the flight number from the features
    X = data.drop(['departure_delay_bin', 'flight.number'], axis=1).values
    y = data['departure_delay_bin'].values

    return X, y
# Apply preprocessing: build the feature matrix X and the encoded delay-bin
# target y that the model cells below train and evaluate on.
X, y = preprocess_data(all_data)


In [91]:
from collections import Counter

class BaggingEnsemble:
    """Hard-voting ensemble over a fixed list of already-constructed models.

    Every member is fitted on the same ``X``/``y`` that ``fit`` receives (no
    bootstrap resampling is performed), and ``predict`` returns, for each
    sample, the label predicted by the largest number of members.
    """

    def __init__(self, models):
        """Store the member models, e.g. ``[logistic_model, tree_model, svm_model]``."""
        self.models = models

    def fit(self, X, y):
        """Call ``fit(X, y)`` on every member, in list order."""
        for member in self.models:
            member.fit(X, y)

    def predict(self, X):
        """Return the per-sample majority vote of the members' predictions.

        When several labels share the top count, the one ``Counter`` lists
        first (the first one encountered among the members) is returned.
        """
        # Rows are members, columns are samples.
        votes = np.array([member.predict(X) for member in self.models])

        winners = []
        for sample_votes in votes.T:
            (label, _count), = Counter(sample_votes).most_common(1)
            winners.append(label)
        return np.array(winners)


In [92]:
class BoostingEnsemble:
    """Weighted-vote ensemble with AdaBoost-style member weights.

    Each member is fitted on the full data, then given a weight ``alpha``
    computed from its weighted training error. ``predict`` returns, per
    sample, the class label with the largest total ``alpha``.

    Limitations of this implementation:
        * The sample weights are only used to compute each member's error;
          they are not passed to ``model.fit``, so they do not change how
          later members are trained.
        * ``num_iterations`` is stored but not used by ``fit`` or ``predict``.

    Attributes:
        models: The member models (each needs ``fit`` and ``predict``).
        num_iterations: Stored constructor argument (currently unused).
        alphas: One vote weight per member, set by ``fit``.
        classes_: Sorted distinct labels seen by ``fit`` (None before fitting).
    """

    def __init__(self, models, num_iterations=10):
        """Initialize the ensemble with a list of models and iterations."""
        self.models = models  # List of base learners
        self.num_iterations = num_iterations
        self.alphas = []  # Store the model weights (alphas)
        self.classes_ = None

    def fit(self, X, y):
        """Fit every member and compute its vote weight.

        Args:
            X: Feature matrix passed unchanged to each member.
            y: Array-like of class labels.
        """
        y = np.asarray(y)
        num_samples = len(y)
        # Initialize sample weights equally
        weights = np.ones(num_samples) / num_samples

        # Reset state from any previous fit; otherwise alphas keep growing
        # and predict() pairs the members with stale weights.
        self.alphas = []
        self.classes_ = np.unique(y)

        eps = 1e-10
        for model in self.models:
            model.fit(X, y)
            missed = np.asarray(model.predict(X)) != y

            # Weighted misclassification rate, kept strictly inside (0, 1) so
            # the logarithm below is finite for perfect and for always-wrong
            # members alike.
            error = np.sum(weights * missed) / np.sum(weights)
            error = np.clip(error, eps, 1 - eps)

            # Vote weight: positive for better-than-chance (error < 0.5)
            alpha = np.log((1 - error) / error)
            self.alphas.append(alpha)

            # Increase the weights of misclassified examples, then normalize
            weights *= np.exp(alpha * missed)
            weights /= np.sum(weights)

    def predict(self, X):
        """Return the class label with the largest alpha-weighted vote per sample.

        Returns:
            numpy.ndarray: One label per row of ``X``. If there is nothing to
            vote on (no members and no fitted classes) an array of zeros is
            returned.
        """
        member_preds = [np.asarray(model.predict(X)) for model in self.models]

        # Candidate labels: everything seen in training plus anything a
        # member predicts.
        pools = list(member_preds)
        if self.classes_ is not None:
            pools.append(np.asarray(self.classes_))
        labels = np.unique(np.concatenate(pools)) if pools else np.array([])
        if labels.size == 0:
            return np.zeros(X.shape[0])

        # scores[k, i] = total alpha of the members that predict labels[k]
        # for sample i.
        scores = np.zeros((labels.size, X.shape[0]))
        for alpha, preds in zip(self.alphas, member_preds):
            scores += alpha * (preds[None, :] == labels[:, None])

        return labels[np.argmax(scores, axis=0)]

In [93]:
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

def cross_validate(model, X, y, num_folds=5):
    """Run shuffled K-fold cross-validation and print mean accuracy and F1.

    A fresh deep copy of ``model`` is fitted on every fold. Fitting the same
    object repeatedly lets state from one fold leak into the next for models
    whose ``fit`` appends to existing attributes; the copy keeps folds
    independent and leaves the caller's ``model`` untouched.

    Args:
        model: Object with ``fit(X, y)`` and ``predict(X)``.
        X: 2-D NumPy feature array (indexed by row with integer arrays).
        y: 1-D NumPy label array.
        num_folds: Number of folds.

    Returns:
        None. The averaged scores are printed.
    """
    import copy  # local import: only this function needs it

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    accuracy_scores = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        fold_model = copy.deepcopy(model)
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_test)

        accuracy = np.mean(y_pred == y_test)
        f1 = f1_score(y_test, y_pred, average='weighted')  # F1 score (weighted for multi-class)

        accuracy_scores.append(accuracy)
        f1_scores.append(f1)

    print(f"Average accuracy: {np.mean(accuracy_scores):.4f}")
    print(f"Average F1-score: {np.mean(f1_scores):.4f}")




In [None]:
# Example usage with your models:
logistic_model = LogisticRegression(learning_rate=0.01, num_iterations=1000)
tree_model = DecisionTree(max_depth=10)
svm_model = MultiClassSVM(kernel='rbf', C=1.0, gamma=0.1)

# Perform cross-validation on each model. cross_validate prints the same two
# "Average ..." lines for every model, so print the model name first to make
# the three reports distinguishable.
for model_name, cv_model in [
    ("Logistic regression", logistic_model),
    ("Decision tree", tree_model),
    ("Multi-class SVM", svm_model),
]:
    print(f"\n{model_name}")
    cross_validate(cv_model, X, y, num_folds=5)

In [None]:
# Create instances of your models
logistic_model = LogisticRegression(learning_rate=0.01, num_iterations=1000)
tree_model = DecisionTree(max_depth=10)
# The target has several delay-bin classes, so use the one-vs-rest wrapper.
# MultiKernelSVM on its own is a binary classifier whose predict() returns
# np.sign(...) rather than class labels, so its votes would not be comparable
# with the other members.
svm_model = MultiClassSVM(kernel='rbf', C=1.0, gamma=0.1)

# Bagging ensemble
bagging_ensemble = BaggingEnsemble([logistic_model, tree_model])
# To include the SVM as a third member:
#   bagging_ensemble = BaggingEnsemble([logistic_model, tree_model, svm_model])
print("\nBagging ensemble")
cross_validate(bagging_ensemble, X, y, num_folds=5)

# Boosting ensemble
boosting_ensemble = BoostingEnsemble([logistic_model, tree_model], num_iterations=10)
# To include the SVM as a third member:
#   boosting_ensemble = BoostingEnsemble([logistic_model, tree_model, svm_model], num_iterations=10)
print("\nBoosting ensemble")
cross_validate(boosting_ensemble, X, y, num_folds=5)