In [1]:
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Read pogoda.csv and discard its 'date' column in a single chained expression.
data = pd.read_csv('pogoda.csv').drop(columns='date')

In [3]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly partition ``X`` and ``y`` into train and test subsets.

    Rows are picked positionally through ``.iloc``, so both inputs must
    support pandas-style positional indexing.

    Args:
        X: Feature table; ``X.shape[0]`` determines the number of rows.
        y: Labels aligned row-for-row with ``X``.
        test_size: Fraction of rows placed in the test subset. The test
            row count is ``int(n_rows * test_size)`` (truncated).
        random_state: Optional seed. When given, NumPy's *global* random
            generator is re-seeded with it, so later ``np.random`` calls
            made elsewhere become reproducible as well.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    n_rows = X.shape[0]

    if random_state is not None:
        np.random.seed(random_state)

    # Random ordering of the row positions.
    order = np.arange(n_rows)
    np.random.shuffle(order)

    # The first `n_test` shuffled positions form the test set, the rest train.
    n_test = int(n_rows * test_size)
    test_idx, train_idx = order[:n_test], order[n_test:]

    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]

In [4]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels, in the same order as ``y_true``.

    Returns:
        ``correct / total`` as a float in ``[0, 1]``.

    Raises:
        ValueError: If the two inputs do not contain the same number of
            labels. Previously the extra labels were silently ignored while
            the denominator stayed ``len(y_true)``, giving a wrong score.
        ZeroDivisionError: If the inputs are empty.
    """
    true_labels = list(y_true)
    pred_labels = list(y_pred)

    if len(true_labels) != len(pred_labels):
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{len(true_labels)} and {len(pred_labels)}"
        )

    correct = sum(1 for truth, guess in zip(true_labels, pred_labels) if truth == guess)
    return correct / len(true_labels)

In [5]:
class DecisionTree:
    """Classification tree that picks binary splits by Gini impurity reduction.

    The fitted tree is stored in ``self.tree`` as nested dicts:

    * leaf node:     ``{'leaf': True, 'class': label}``
    * internal node: ``{'leaf': False, 'feature_index': int, 'threshold': value,
      'left': node, 'right': node}`` where rows with
      ``x[feature_index] <= threshold`` go left and all others go right.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which nodes are forced to become leaves.
                ``None`` means the depth is not limited.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features). Lists and
                DataFrames are accepted; they are converted with ``np.asarray``.
            y: 1-D array-like of class labels, one per row of ``X``.
        """
        # Column slicing and boolean masking below require ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        """Return an array with the predicted class label for each row of ``X``."""
        # Iterating a DataFrame directly would yield column labels, not rows.
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its class."""
        # Iterative descent, so deep trees cannot hit the recursion limit.
        while not node['leaf']:
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['class']

    def _build_tree(self, X, y, depth=0):
        """Recursively build and return the node dict for the given samples."""
        # Stop when the depth limit is reached or the node is already pure.
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return {'leaf': True, 'class': self._most_common_class(y)}

        feature_index, threshold = self._find_best_split(X, y)
        if feature_index is None or threshold is None:
            # Every feature is constant here, so nothing can be split.
            return {'leaf': True, 'class': self._most_common_class(y)}

        # Rows that do not satisfy `<=` go right, the same rule prediction uses.
        left_indices = X[:, feature_index] <= threshold
        right_indices = ~left_indices

        if np.all(left_indices) or np.all(right_indices):
            return {'leaf': True, 'class': self._most_common_class(y)}

        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {'leaf': False, 'feature_index': feature_index, 'threshold': threshold,
                'left': left_tree, 'right': right_tree}

    def _find_best_split(self, X, y):
        """Return ``(feature_index, threshold)`` of the split with the largest gain.

        Returns ``(None, None)`` when no feature has at least two distinct
        values, i.e. when no split can separate the samples.
        """
        best_gain = -1
        best_feature_index = None
        best_threshold = None

        # The parent's impurity is the same for every candidate; compute it once.
        parent_gini = self._gini(y)

        for feature_index in range(X.shape[1]):
            # np.unique returns sorted values. The largest one is skipped:
            # `<= max` sends every row left, which is not a real split and
            # could otherwise be selected (with gain 0) ahead of usable splits.
            candidate_thresholds = np.unique(X[:, feature_index])[:-1]

            for threshold in candidate_thresholds:
                gain = self._calculate_information_gain(
                    X, y, feature_index, threshold, parent_gini=parent_gini
                )

                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _calculate_information_gain(self, X, y, feature_index, threshold, parent_gini=None):
        """Return the Gini impurity reduction of splitting at ``threshold``.

        Despite the name, the measure is based on Gini impurity, not entropy:
        ``gini(parent) - w_left * gini(left) - w_right * gini(right)``.

        Args:
            X: 2-D ndarray of samples.
            y: 1-D ndarray of labels aligned with ``X``.
            feature_index: Column of ``X`` to split on.
            threshold: Rows with ``X[:, feature_index] <= threshold`` go left.
            parent_gini: Optional precomputed ``self._gini(y)``.
        """
        if parent_gini is None:
            parent_gini = self._gini(y)

        left_indices = X[:, feature_index] <= threshold
        y_left = y[left_indices]
        y_right = y[~left_indices]

        left_weight = len(y_left) / len(y)
        right_weight = len(y_right) / len(y)

        return (parent_gini
                - left_weight * self._gini(y_left)
                - right_weight * self._gini(y_right))

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``."""
        if len(y) == 0:
            # An empty node contains no mixed labels.
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _most_common_class(self, y):
        """Return the most frequent label in ``y`` (ties: smallest sorted label)."""
        classes, counts = np.unique(y, return_counts=True)
        return classes[np.argmax(counts)]

In [6]:
class RandomForestClassifier:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Every tree is trained on a bootstrap sample (rows drawn with replacement,
    same size as the training set). All features are offered to every split;
    no per-split feature subsampling is performed.
    """

    def __init__(self, n_estimators=100, max_depth=None, random_state=None):
        """Create an unfitted forest.

        Args:
            n_estimators: Number of trees to train.
            max_depth: Depth limit passed to every ``DecisionTree``.
            random_state: Optional seed for the bootstrap sampling. When
                ``None`` the samples are drawn from NumPy's global random
                generator.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.estimators = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of class labels, one per row of ``X``.
        """
        X = np.array(X)
        y = np.array(y)

        # Without a seed, keep drawing from the global generator.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.estimators = []
        for _ in range(self.n_estimators):
            bootstrap_indices = rng.choice(len(X), size=len(X), replace=True)

            estimator = DecisionTree(max_depth=self.max_depth)
            estimator.fit(X[bootstrap_indices], y[bootstrap_indices])
            self.estimators.append(estimator)

    def predict(self, X):
        """Return the majority-vote label for each row of ``X``.

        Ties between labels are resolved in favour of the label that sorts
        first.

        Raises:
            ValueError: If the forest holds no fitted estimators.
        """
        if not self.estimators:
            raise ValueError("RandomForestClassifier has no fitted estimators; call fit() first")

        X = np.array(X)

        # Shape (n_estimators, n_samples): one row of predictions per tree.
        predictions = np.array([estimator.predict(X) for estimator in self.estimators])

        # Vote per sample (column), not over whole prediction rows.
        return np.array([
            self._majority_label(predictions[:, sample_index])
            for sample_index in range(predictions.shape[1])
        ])

    @staticmethod
    def _majority_label(votes):
        """Return the most frequent label among ``votes``."""
        labels, counts = np.unique(votes, return_counts=True)
        return labels[np.argmax(counts)]

In [7]:
# Columns used as model inputs; this order is also the order of values
# expected in each row passed to the classifier.
feature_names = ['precipitation', 'temp_max', 'temp_min', 'wind']
data = data[feature_names + ['weather']]  # Keep only the features followed by the label column

# Separate the features (input) and the target (output) column
X = data[feature_names]
y = data['weather']

# Hold out 20% of the rows for testing. Passing random_state=42 seeds NumPy's
# global random generator inside train_test_split, which makes the shuffle
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('step')  # progress marker: the cell ran to completion


step


In [None]:
# Train a forest with the default settings (n_estimators=100, max_depth=None).
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print('step')  # progress marker: training finished

In [None]:
# Predict labels for the held-out test features.
y_pred = clf.predict(X_test)
print('step')  # progress marker: prediction finished

In [None]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
# One hand-written sample; the four values are presumably given in the
# feature_names order (precipitation, temp_max, temp_min, wind) - confirm.
new_data = [[0.2, 25.0, 15.0, 10.5]]  # Example data for prediction
predicted_weather = clf.predict(new_data)
print('Predicted weather:', predicted_weather)
