In [None]:
import pandas as pd
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the cyberbullying tweets CSV inside the mounted Drive.
file_path = '/content/drive/My Drive/dataset/cyberbullying_tweets.csv'

In [None]:
# Load the CSV into a DataFrame; later cells read its `tweet_text` and `cyberbullying_type` columns.
dataset = pd.read_csv(file_path)

In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of print()'s truncated plain text.
dataset.head()

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  If you are feeling perturbed about women that ...  not_cyberbullying


In [None]:
# Count missing values per column; the bare last expression is displayed by the notebook.
dataset.isnull().sum()

tweet_text            0
cyberbullying_type    0
dtype: int64


In [None]:
X = dataset["tweet_text"]  # Features: the raw tweet text
y = dataset["cyberbullying_type"]  # Labels: the cyberbullying category of each tweet

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# NLP resources: tokenizer models used by word_tokenize and the stopword lists.
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from the separate 'punkt_tab'
# package, and word_tokenize raises LookupError without it. On older releases
# that do not know this package, download() just reports an error and returns False.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Tokenization
def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens with NLTK's ``word_tokenize``.

    Returns whatever ``word_tokenize`` returns for the lower-cased string.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

In [None]:
# Stopword removal
# Cache for the NLTK stopword set, so the corpus is read once instead of once per tweet.
_STOP_WORDS_CACHE = {}


def remove_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stopwords, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. The comparison is exact, so callers should lower-case
        tokens beforehand if the stopword collection is lower-case.
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's English stopword list is used; it
        is loaded on first use and cached for subsequent calls.

    Returns
    -------
    list of str
        The tokens not present in ``stop_words``.
    """
    if stop_words is None:
        if 'english' not in _STOP_WORDS_CACHE:
            # Loading the list reads the corpus; do it only once.
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE['english']
    return [token for token in tokens if token not in stop_words]

In [None]:
# Preprocessing
def preprocess_text(text):
    """Tokenize ``text``, drop stopwords, and re-join the remaining tokens with single spaces."""
    return ' '.join(remove_stopwords(tokenize_text(text)))

In [None]:
# Apply the preprocessing to every entry of the feature column
X_preprocessed = X.apply(preprocess_text)

In [None]:
# TF-IDF: turn the preprocessed tweets into a document-term matrix.
# NOTE(review): the vectorizer is fitted on the whole corpus, before the
# train/test split done in a later cell, so the vocabulary and IDF weights are
# learned from rows that end up in the test set. Consider splitting first and
# fitting on the training portion only.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X_preprocessed)

In [None]:
# Shape of the resulting TF-IDF matrix
print("Matriz TF-IDF:", X_tfidf.shape)

Matriz TF-IDF: (46976, 59652)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Training (80%) and test (20%) sets; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [None]:
import numpy as np

class DecisionTree:
    """Binary classification tree that chooses splits by weighted Gini impurity.

    For every feature, a fixed set of percentiles of that feature's values is
    tried as thresholds; samples with ``value < threshold`` go to the left
    child and the rest to the right child.

    The fitted tree is stored in ``self.tree`` as nested dicts. Internal nodes
    have the keys ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'``;
    leaves have the single key ``'label'``. ``self.tree`` is ``None`` until
    ``fit`` has been called.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum depth of the tree. ``None`` grows until nodes are pure or no
        valid split exists.
    percentiles : sequence of float, default (10, 25, 50, 75, 90)
        Percentiles of each feature column used as candidate thresholds.
    """

    def __init__(self, max_depth=None, percentiles=(10, 25, 50, 75, 90)):
        self.max_depth = max_depth
        self.percentiles = tuple(percentiles)
        # Initialised here so predict() can test for an unfitted tree.
        self.tree = None

    @staticmethod
    def _to_dense(X):
        """Return ``X`` as a plain ``ndarray``, densifying sparse input via ``toarray()``."""
        if hasattr(X, 'toarray'):
            X = X.toarray()
        return np.asarray(X)

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y``; ties go to the smallest label in sort order."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def fit(self, X, y):
        """Grow the tree from a 2-D feature matrix ``X`` and one label per row in ``y``.

        ``y`` is converted to a NumPy array so all indexing is positional. A
        pandas Series with a shuffled index (e.g. the output of
        ``train_test_split``) would otherwise be indexed by label.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if
            there are no samples.
        """
        X = self._to_dense(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the node dict for the samples ``X``, ``y`` at ``depth``."""
        # Depth limit reached: stop with the majority class.
        if self.max_depth is not None and depth >= self.max_depth:
            return {'label': self._majority_label(y)}

        # Pure node: take the label from the unique values rather than y[0],
        # so the result never depends on how y is indexed.
        classes = np.unique(y)
        if len(classes) == 1:
            return {'label': classes[0]}

        best_split = self._find_best_split(X, y)
        if best_split is None:
            # No threshold separates the samples: fall back to the majority class.
            return {'label': self._majority_label(y)}

        left_idxs = X[:, best_split['feature']] < best_split['threshold']
        right_idxs = ~left_idxs

        # Defensive: never recurse into an empty child.
        if not left_idxs.any() or not right_idxs.any():
            return {'label': self._majority_label(y)}

        left_subtree = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_subtree = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return {'feature': best_split['feature'],
                'threshold': best_split['threshold'],
                'left': left_subtree,
                'right': right_subtree}

    def _find_best_split(self, X, y):
        """Return ``{'feature', 'threshold'}`` with the lowest weighted Gini, or ``None``.

        ``None`` is returned when every candidate threshold leaves one side empty.
        """
        best_gini = 1.0
        best_split = None
        n_samples, n_features = X.shape

        for feature in range(n_features):
            column = X[:, feature]
            # Percentiles of a column with many repeated values often coincide;
            # evaluating each distinct threshold once gives the same winner.
            thresholds = np.unique(np.percentile(column, self.percentiles))
            for threshold in thresholds:
                left_idxs = column < threshold
                n_left = np.count_nonzero(left_idxs)
                if n_left == 0 or n_left == n_samples:
                    continue
                left_weight = n_left / n_samples
                gini = (self._gini_impurity(y[left_idxs]) * left_weight
                        + self._gini_impurity(y[~left_idxs]) * (1.0 - left_weight))
                if gini < best_gini:
                    best_gini = gini
                    best_split = {'feature': feature, 'threshold': threshold}
        return best_split

    def _gini_impurity(self, y):
        """Return the Gini impurity ``1 - sum(p_c ** 2)`` of labels ``y`` (0 for empty ``y``)."""
        if len(y) == 0:
            return 0
        _, counts = np.unique(y, return_counts=True)
        class_probs = counts / len(y)
        return 1 - np.sum(class_probs ** 2)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``.

        If the tree has not been fitted, every prediction is ``None``.
        """
        X = self._to_dense(X)
        if self.tree is None:
            return np.array([None for _ in X])
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, tree):
        """Walk ``tree`` for the single sample ``x`` and return the label of the leaf reached."""
        if tree is None:
            return None
        if 'label' in tree:
            return tree['label']
        if x[tree['feature']] < tree['threshold']:
            return self._predict_tree(x, tree['left'])
        else:
            return self._predict_tree(x, tree['right'])

from sklearn.metrics import accuracy_score, classification_report

# The hand-written DecisionTree slices columns with X[:, j] and passes them to
# np.percentile, so it needs dense NumPy arrays rather than the SciPy sparse
# matrices that TfidfVectorizer / train_test_split produce. These two names
# were previously undefined in the notebook, so the cell failed on a fresh kernel.
# NOTE(review): densifying is memory-hungry. With the matrix shape printed
# above, the training part alone is roughly 18 GB if the values are float64.
# Confirm the runtime has enough RAM, or limit the vocabulary first.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

tree = DecisionTree(max_depth=2)
# Pass the labels as a NumPy array so the tree indexes them by position;
# y_train keeps the shuffled index of the original DataFrame.
tree.fit(X_train_dense, y_train.to_numpy())
y_pred = tree.predict(X_test_dense)

accuracy = accuracy_score(y_test, y_pred)
print("Precisión del modelo:", accuracy)
print("Reporte de clasificación:")
# zero_division=0 reports 0.0 for classes that are never predicted (the same
# value as the default) without emitting an undefined-metric warning per class.
print(classification_report(y_test, y_pred, zero_division=0))

Precisión del modelo: 0.3468497232865049
Reporte de clasificación:
                     precision    recall  f1-score   support

                age       0.97      0.52      0.68      1572
          ethnicity       0.97      0.59      0.73      1545
             gender       0.20      0.99      0.34      1552
  not_cyberbullying       0.00      0.00      0.00      1564
other_cyberbullying       0.00      0.00      0.00      1553
           religion       0.00      0.00      0.00      1610

           accuracy                           0.35      9396
          macro avg       0.36      0.35      0.29      9396
       weighted avg       0.36      0.35      0.29      9396



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
