In [9]:
!pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.16.1-cp310-cp310-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Using cached tensorflow_intel-2.16.1-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading gast-0.5.5.tar.gz (26 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.16.1->tensorflow)
  Using cached google_pas


[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

from src import paths

In [2]:
# Read the labelled posts (text, risk label, embedding) from the intermediate data folder.
training_data_path = paths.INTERMEDIATE_DATA_PATH / "training_data_embedding.parquet"
labelled = pd.read_parquet(training_data_path)

# Preview the first rows.
labelled.head()

Unnamed: 0,post,post_risk,embedding
0,Just kill me. Please! Just end my life! I beg ...,ideation,"[0.05026989057660103, 0.00029446851112879813, ..."
1,"There is no today, nor tommorrow. I dont have ...",behavior,"[-0.007263310253620148, 0.01930110529065132, -..."
2,Life so full of contradictions that it's not w...,indicator,"[0.034987498074769974, 0.022024665027856827, -..."
3,I think I'm going to kill myself soon. I don't...,behavior,"[-0.005158697720617056, 0.020152874290943146, ..."
4,whats the point of living. no really. is there...,ideation,"[0.011075956746935844, 0.054204490035772324, 0..."


In [3]:
# Integer-encode the string risk labels. The fitted encoder is kept so that
# integer predictions can be mapped back to label names later.
labels = labelled["post_risk"].values
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

# Stack the per-row embedding vectors into one feature array.
embeddings = np.array(labelled["embedding"].to_list())

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Base estimator; the fixed seed keeps every fit repeatable.
clf = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values explored exhaustively by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Candidates are ranked by weighted-average F1.
f1_scorer = make_scorer(f1_score, average="weighted")

# 5-fold cross-validated search over the grid, fitted on the training split only.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    verbose=3,
)
grid_search.fit(X_train, y_train)

# Report the mean cross-validated score of every candidate.
results = grid_search.cv_results_
for idx, params in enumerate(results["params"]):
    mean_score = results["mean_test_score"][idx]
    print(f"F1 Score: {mean_score:.4f} for Params: {params}")

# Score the best estimator found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"Best model parameters: {grid_search.best_params_}")
print(f"F1 Score: {f1}")

# Map the integer predictions back to their original string labels.
y_pred_labels = label_encoder.inverse_transform(y_pred)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END max_depth=None, min_samples_split=2, n_estimators=50;, score=0.478 total time=   0.7s
[CV 2/5] END max_depth=None, min_samples_split=2, n_estimators=50;, score=0.454 total time=   0.6s
[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=50;, score=0.440 total time=   0.5s
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=50;, score=0.442 total time=   0.5s
[CV 5/5] END max_depth=None, min_samples_split=2, n_estimators=50;, score=0.489 total time=   0.5s
[CV 1/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.473 total time=   1.0s
[CV 2/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.418 total time=   0.9s
[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.483 total time=   1.4s
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.469 total time=   1.0s
[CV 5/5] END max_depth=None, min_samples_sp

In [7]:
# Alias (not a copy) of the labelled frame under the name `df`, which the
# neural-network cell further down reads from.
df = labelled

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K

# Neural-network baseline: a small dense classifier trained on the embeddings.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Features and targets. The target column of this frame is `post_risk`
# (there is no `label` column).
embeddings = np.array(df['embedding'].tolist())
labels = df['post_risk']

# Encode string labels to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data (same proportions and seed as the random-forest experiment)
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, encoded_labels, test_size=0.2, random_state=42
)


def weighted_f1_score(y_true, y_pred):
    """Return the weighted-average F1 score of arg-max class predictions.

    Parameters
    ----------
    y_true : array-like
        Integer-encoded true labels; flattened to 1-D before scoring.
    y_pred : array-like
        Per-class scores with the classes on the last axis, e.g. the
        output of ``model.predict``.

    Returns
    -------
    float
        ``sklearn.metrics.f1_score(y_true, argmax(y_pred), average='weighted')``.

    Notes
    -----
    This operates on concrete arrays and is meant to be called on predictions
    after training. It is deliberately not passed to ``model.compile``:
    scikit-learn metrics cannot run on the symbolic tensors Keras hands to a
    compiled metric, and an average of per-batch F1 values would not equal
    the F1 of the whole dataset.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.argmax(np.asarray(y_pred), axis=-1)
    return f1_score(y_true, y_pred, average='weighted')


# Define the neural network architecture
n_features = X_train.shape[1]
n_classes = len(label_encoder.classes_)

model = Sequential([
    Dense(128, activation='relu', input_shape=(n_features,)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_classes, activation='softmax')
])

# Track accuracy during training; weighted F1 is computed on predictions below.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Stop when the validation loss stops improving. The direction is stated
# explicitly so it never depends on Keras inferring it from the metric name.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.2,
                    callbacks=[early_stopping],
                    verbose=1)

# Evaluate the model: loss/accuracy from Keras, weighted F1 from the full set
# of test predictions.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
weighted_f1 = weighted_f1_score(y_test, y_pred_prob)

print(f"Weighted F1 Score on Test Set: {weighted_f1:.4f}")

# Optionally, decode the predicted class indices back to string labels
y_pred_labels = label_encoder.inverse_transform(y_pred)


ModuleNotFoundError: No module named 'tensorflow'