In [None]:
import joblib
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load the fitted TF-IDF vectorizer and the three preprocessed splits.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load vectorizer files that come from a trusted source.

tfidf_vectorizer = joblib.load("../Trans data/tfidf_vectorizer_1.pkl")

# Same order as the split names on the left: train, test, dev.
df_train, df_test, df_dev = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Trans data/train_processed.csv",
        "../Trans data/test_processed.csv",
        "../Trans data/dev_processed.csv",
    )
)

In [None]:
# Apply TF-IDF transformation
#
# A row whose "text" field is empty in the CSV is read back by pandas as NaN
# rather than "". The vectorizer's transform() raises on NaN documents
# (assuming the pickled object is a scikit-learn TfidfVectorizer), so missing
# text is mapped back to the empty string first. Rows that already contain
# text are passed through unchanged; an empty document has no terms and
# therefore yields an all-zero row.

X_train_tfidf = tfidf_vectorizer.transform(df_train["text"].fillna(""))
X_test_tfidf = tfidf_vectorizer.transform(df_test["text"].fillna(""))
X_dev_tfidf = tfidf_vectorizer.transform(df_dev["text"].fillna(""))

In [None]:
# Reduce the TF-IDF matrices with Truncated SVD.
# The decomposition is fitted on the training matrix only; the test and dev
# matrices are then projected with that same fitted model.

n_components = 2048
svd = TruncatedSVD(n_components=n_components, random_state=42)

X_train_reduced = svd.fit_transform(X_train_tfidf)
X_test_reduced, X_dev_reduced = (
    svd.transform(tfidf_matrix) for tfidf_matrix in (X_test_tfidf, X_dev_tfidf)
)

In [None]:
# Wrap each reduced matrix in a DataFrame with columns f1..f<n_components>
# and the split's "id" column placed first.

feature_names = [f"f{i+1}" for i in range(n_components)]


def _reduced_frame(reduced_matrix, source_df, columns):
    """Return a DataFrame of ``reduced_matrix`` with ``source_df["id"]`` as column 0.

    ``columns`` supplies the feature column names. The id Series is attached
    with ``DataFrame.insert``, i.e. aligned on the index of the new frame.
    """
    frame = pd.DataFrame(reduced_matrix, columns=columns)
    frame.insert(0, "id", source_df["id"])
    return frame


df_train_reduced = _reduced_frame(X_train_reduced, df_train, feature_names)
df_test_reduced = _reduced_frame(X_test_reduced, df_test, feature_names)
df_dev_reduced = _reduced_frame(X_dev_reduced, df_dev, feature_names)

In [None]:
# Preview the first rows of the reduced training frame ("id" followed by the
# f1..fN feature columns) as a quick visual check before writing to disk.
df_train_reduced.head()

In [None]:
# Write each reduced split to its own CSV file, without the DataFrame index.

for reduced_frame, output_path in (
    (df_train_reduced, "../Trans data/train_features.csv"),
    (df_test_reduced, "../Trans data/test_features.csv"),
    (df_dev_reduced, "../Trans data/dev_features.csv"),
):
    reduced_frame.to_csv(output_path, index=False)