In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("cleaned.csv")
df = df.drop(['Deskripsi'], axis=1)
df.head()

Unnamed: 0,Preferensi,Cleaned
0,['Kopisang (es kopi susu dengan campuran peris...,minum bahan dasar kopi inginya manis milik dingin
1,['MangoBoost (campuran sirup mangga dengan sus...,minum dingin bahan dasar susu
2,['Chocobanana (es coklat dengan campuran peris...,minum bahan dasar coklat milik perisa dingin
3,['Tubruk (seduhan dari gilingan kasar dari kop...,minum bahan dasar kopi panas
4,['Ice Lychee Tea (teh leci dengan buah leci)'],menginkan minum bahan dasar teh dingin


In [16]:
# Get first 3 rows and save to new df
df_manualisasi = df.head(3)
# Saae to csv
df_manualisasi.to_csv('manualisasi.csv', index=False)

In [65]:
# TF IDF From Scratchl
import math

class CustomTFIDFVectorizer:
    def __init__(self):
        self.vocab = set()
        self.idf = {}
        self.doc_count = 0

    def fit_transform(self, documents):
        self.doc_count = len(documents)

        # Step 1: Calculate Term Frequencies (TF)
        tf_matrix = []
        for doc in documents:
            tf_doc = {}
            for word in doc.split():
                self.vocab.add(word)
                tf_doc[word] = (tf_doc.get(word, 0) + 1) / len(doc.split())
            tf_matrix.append(tf_doc)
        # Print TF Matrix
        print("TF Matrix:")
        for item in tf_matrix:
            print(item)
        
        # Step 2: Calculate Inverse Document Frequencies (IDF)
        for word in self.vocab:
            doc_count_with_word = sum(1 for doc in documents if word in doc)
            self.idf[word] = math.log(self.doc_count / (doc_count_with_word + 1), 10)
        # Print IDF
        print("IDF:")
        for word, idf in self.idf.items():
            print(f"{word}: {idf}")

        # Step 3: Compute TF-IDF Scores
        tfidf_matrix = []
        for tf_doc in tf_matrix:
            tfidf_doc = {}
            for word, tf in tf_doc.items():
                tfidf_doc[word] = tf * self.idf[word]
            tfidf_matrix.append(tfidf_doc)

        return tfidf_matrix

In [66]:
# Use TF-IDF to vectorize the words in Cleaned columns
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf2 = TfidfVectorizer()
X_manualisasi2 = tfidf2.fit_transform(df_manualisasi["Cleaned"])

tfidf = CustomTFIDFVectorizer()
X_manualisasi = tfidf.fit_transform(df_manualisasi["Cleaned"])
# X_manualisasi

TF Matrix:
{'minum': 0.125, 'bahan': 0.125, 'dasar': 0.125, 'kopi': 0.125, 'inginya': 0.125, 'manis': 0.125, 'milik': 0.125, 'dingin': 0.125}
{'minum': 0.2, 'dingin': 0.2, 'bahan': 0.2, 'dasar': 0.2, 'susu': 0.2}
{'minum': 0.14285714285714285, 'bahan': 0.14285714285714285, 'dasar': 0.14285714285714285, 'coklat': 0.14285714285714285, 'milik': 0.14285714285714285, 'perisa': 0.14285714285714285, 'dingin': 0.14285714285714285}
Vocab: {'dingin', 'dasar', 'bahan', 'perisa', 'inginya', 'susu', 'coklat', 'kopi', 'manis', 'milik', 'minum'}
-0.12493873660829993
IDF:
dingin: -0.12493873660829993
dasar: -0.12493873660829993
bahan: -0.12493873660829993
perisa: 0.17609125905568124
inginya: 0.17609125905568124
susu: 0.17609125905568124
coklat: 0.17609125905568124
kopi: 0.17609125905568124
manis: 0.17609125905568124
milik: 0.0
minum: -0.12493873660829993


In [39]:
X_manualisasi

[{'minum': -0.2876820724517809,
  'bahan': -0.2876820724517809,
  'dasar': -0.2876820724517809,
  'kopi': 0.4054651081081644,
  'inginya': 0.4054651081081644,
  'manis': 0.4054651081081644,
  'milik': 0.0,
  'dingin': -0.2876820724517809},
 {'minum': -0.2876820724517809,
  'dingin': -0.2876820724517809,
  'bahan': -0.2876820724517809,
  'dasar': -0.2876820724517809,
  'susu': 0.4054651081081644},
 {'minum': -0.2876820724517809,
  'bahan': -0.2876820724517809,
  'dasar': -0.2876820724517809,
  'coklat': 0.4054651081081644,
  'milik': 0.0,
  'perisa': 0.4054651081081644,
  'dingin': -0.2876820724517809}]

In [40]:
# Print X_manualisasi2
print(X_manualisasi2)

  (0, 3)	0.2648287324074431
  (0, 7)	0.3410152109911944
  (0, 6)	0.44839402160692654
  (0, 4)	0.44839402160692654
  (0, 5)	0.44839402160692654
  (0, 2)	0.2648287324074431
  (0, 0)	0.2648287324074431
  (0, 8)	0.2648287324074431
  (1, 10)	0.6461289150464732
  (1, 3)	0.3816141458138271
  (1, 2)	0.3816141458138271
  (1, 0)	0.3816141458138271
  (1, 8)	0.3816141458138271
  (2, 9)	0.5016513317715935
  (2, 1)	0.5016513317715935
  (2, 3)	0.2962833577206743
  (2, 7)	0.3815187681027303
  (2, 2)	0.2962833577206743
  (2, 0)	0.2962833577206743
  (2, 8)	0.2962833577206743


In [3]:
# # Example usage
# documents = [
#     "this is the first document",
#     "this document is the second document",
#     "and this is the third one",
#     "is this the first document",
# ]

# custom_tfidf = CustomTFIDFVectorizer()
# tfidf_matrix = custom_tfidf.fit_transform(documents)

# # Print TF-IDF scores for the first document
# for word, tfidf_score in tfidf_matrix[0].items():
#     print(f"{word}: {tfidf_score}")

In [4]:
# Use TF-IDF to vectorize the words in Cleaned columns
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = CustomTFIDFVectorizer()
X = tfidf.fit_transform(df["Cleaned"])
y = df["Preferensi"]

In [5]:
X

[{'minum': 0.12279557747992764,
  'bahan': 0.8209805520698303,
  'dasar': 0.8728056199344163,
  'kopi': 0.7911275889201491,
  'inginya': 4.722953221644475,
  'manis': 1.8897398775882588,
  'milik': 3.1135153092103742,
  'dingin': 0.9734491457141037},
 {'minum': 0.12279557747992764,
  'dingin': 0.9734491457141037,
  'bahan': 0.8209805520698303,
  'dasar': 0.8728056199344163,
  'susu': 0.9162907318741551},
 {'minum': 0.12279557747992764,
  'bahan': 0.8209805520698303,
  'dasar': 0.8728056199344163,
  'coklat': 1.4271163556401458,
  'milik': 3.1135153092103742,
  'perisa': 1.5040773967762742,
  'dingin': 0.9734491457141037},
 {'minum': 0.12279557747992764,
  'bahan': 0.8209805520698303,
  'dasar': 0.8728056199344163,
  'kopi': 0.7911275889201491,
  'panas': 2.643511679964639},
 {'menginkan': 4.722953221644475,
  'minum': 0.12279557747992764,
  'bahan': 0.8209805520698303,
  'dasar': 0.8728056199344163,
  'teh': 2.7770430725891617,
  'dingin': 0.9734491457141037},
 {'minum': 0.122795577479

In [13]:
y

0      ['Kopisang (es kopi susu dengan campuran peris...
1      ['MangoBoost (campuran sirup mangga dengan sus...
2      ['Chocobanana (es coklat dengan campuran peris...
3      ['Tubruk (seduhan dari gilingan kasar dari kop...
4         ['Ice Lychee Tea (teh leci dengan buah leci)']
                             ...                        
220                                        ['Kopi Susu']
221                                        ['Kopi Susu']
222                                        ['Kopi Susu']
223                                        ['Kopi Susu']
224                                        ['Kopi Susu']
Name: Preferensi, Length: 225, dtype: object

In [6]:
X_train = np.array([[sample.get(word, 0) for word in X[0]] for sample in X])

In [7]:
X_train

array([[0.12279558, 0.82098055, 0.87280562, ..., 1.88973988, 3.11351531,
        0.97344915],
       [0.12279558, 0.82098055, 0.87280562, ..., 0.        , 0.        ,
        0.97344915],
       [0.12279558, 0.82098055, 0.87280562, ..., 0.        , 3.11351531,
        0.97344915],
       ...,
       [0.12279558, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.12279558, 0.82098055, 0.87280562, ..., 0.        , 0.        ,
        0.97344915],
       [0.12279558, 0.        , 0.        , ..., 0.        , 0.        ,
        0.97344915]])

In [11]:
import numpy as np
#import decision_tree scikit-learn
from sklearn.tree import DecisionTreeClassifier

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def train(self, X, y):
        # Define a stopping criterion
        def should_stop(y):
            return len(set(y)) == 1 or (self.max_depth is not None and len(y) <= self.max_depth)

        # Define a function to find the best split
        def find_best_split(X, y):
            best_gini = float('inf')
            best_split = None
            for feature in range(X.shape[1]):
                unique_values = np.unique(X[:, feature])
                for value in unique_values:
                    left_mask = X[:, feature] <= value
                    right_mask = ~left_mask
                    gini = (np.sum(left_mask) * gini_impurity(y[left_mask]) +
                            np.sum(right_mask) * gini_impurity(y[right_mask])) / len(y)
                    if gini < best_gini:
                        best_gini = gini
                        best_split = (feature, value)
            return best_split
        
        def gini_impurity(y):
            if len(y) == 0:
                return 0
            p = np.bincount(y) / len(y)
            return 1 - np.sum(p ** 2)

        # Define a recursive function to build the tree
        def build_tree(X, y, depth):
            if should_stop(y) or (self.max_depth is not None and depth == self.max_depth):
                return {'class': np.bincount(y).argmax()}
            feature, value = find_best_split(X, y)
            if feature is None:
                return {'class': np.bincount(y).argmax()}
            left_mask = X[:, feature] <= value
            right_mask = ~left_mask
            left_tree = build_tree(X[left_mask], y[left_mask], depth + 1)
            right_tree = build_tree(X[right_mask], y[right_mask], depth + 1)
            return {
                'feature': feature,
                'value': value,
                'left': left_tree,
                'right': right_tree
            }

        self.tree = build_tree(X, y, 0)

    def predict(self, X):
        def predict_single(x, node):
            if 'class' in node:
                return node['class']
            if x[node['feature']] <= node['value']:
                return predict_single(x, node['left'])
            else:
                return predict_single(x, node['right'])

        return np.array([predict_single(x, self.tree) for x in X])


class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, max_features=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_estimators):
            # Step 1: Bootstrap Sampling
            sample_indices = np.random.choice(range(len(X)), size=len(X), replace=True)
            X_sampled = X[sample_indices]
            y_sampled = y[sample_indices]

            # Step 2: Feature Selection
            if self.max_features is not None:
                feature_indices = np.random.choice(range(X.shape[1]), size=self.max_features, replace=False)
                X_sampled = X_sampled[:, feature_indices]

            # Step 3: Build Decision Trees
            tree = DecisionTreeClassifier(max_depth=self.max_depth)
            tree.fit(X_sampled, y_sampled)
            # tree = DecisionTree(max_depth=self.max_depth)
            # tree.train(X_sampled, y_sampled)
            self.trees.append(tree)

    def predict(self, X):
        # Step 4: Aggregate Predictions
        predictions = np.zeros((len(X), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X)

        # Aggregate predictions from all trees
        final_predictions = np.mean(predictions, axis=1)
        return final_predictions


In [12]:
# Assuming tfidf_matrix is in the format you provided
# Convert it to a 2D numpy array
X_train = np.array([[sample.get(word, 0) for word in X[0]] for sample in X])

# Assuming y_train is a list of labels
# Convert it to a numpy array if it's not already
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y)

# Create a RandomForestClassifier instance
rf = RandomForestClassifier(n_estimators=100, max_depth=5)

# Train the random forest
rf.fit(X_train, y_train)

# Make predictions
predictions = rf.predict(X_train)

ValueError: attempt to get argmax of an empty sequence