In [3]:
import joblib
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [4]:
# Restore the previously saved TF-IDF vectorizer, then read the train / test /
# dev splits from their processed CSV files.
# NOTE: joblib.load unpickles the file, so it must only be pointed at a
# trusted artifact.

tfidf_vectorizer = joblib.load("../Trans data/tfidf_vectorizer_1.pkl")
df_train, df_test, df_dev = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Trans data/train_processed.csv",
        "../Trans data/test_processed.csv",
        "../Trans data/dev_processed.csv",
    )
)

In [5]:
# Apply TF-IDF transformation
#
# The splits were read back from CSV, where pandas parses empty cells (and
# tokens such as "NA" or "null") as NaN. A NaN entry is not a string, and the
# vectorizer (presumably a scikit-learn TfidfVectorizer -- it is loaded from a
# pickle, so confirm) fails on such documents. Missing texts are therefore
# replaced with "" before transforming: every row is kept, so the resulting
# matrices stay aligned row-for-row with each frame's "id" column. When the
# column has no missing values this is identical to transforming it directly.

X_train_tfidf = tfidf_vectorizer.transform(df_train["text"].fillna(""))
X_test_tfidf = tfidf_vectorizer.transform(df_test["text"].fillna(""))
X_dev_tfidf = tfidf_vectorizer.transform(df_dev["text"].fillna(""))

In [6]:
# Dimensionality reduction with Truncated SVD.
# The decomposition is fitted on the training matrix only; the test and dev
# matrices are projected with that same fitted model.

n_components = 2048
svd = TruncatedSVD(random_state=42, n_components=n_components)
X_train_reduced = svd.fit_transform(X_train_tfidf)
X_test_reduced, X_dev_reduced = (
    svd.transform(tfidf_matrix) for tfidf_matrix in (X_test_tfidf, X_dev_tfidf)
)

In [7]:
# Wrap each reduced matrix in a DataFrame whose columns are f1..f<n_components>,
# with the split's "id" column placed first.

feature_names = [f"f{k}" for k in range(1, n_components + 1)]


def _with_ids(ids, reduced):
    """Return a frame of `reduced` (columns = feature_names) with `ids` inserted as column 0 named "id"."""
    frame = pd.DataFrame(reduced, columns=feature_names)
    frame.insert(0, "id", ids)
    return frame


df_train_reduced = _with_ids(df_train["id"], X_train_reduced)
df_test_reduced = _with_ids(df_test["id"], X_test_reduced)
df_dev_reduced = _with_ids(df_dev["id"], X_dev_reduced)

In [8]:
# Preview the first five rows of the reduced training features.
df_train_reduced.head(n=5)

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f2039,f2040,f2041,f2042,f2043,f2044,f2045,f2046,f2047,f2048
0,3268,0.031737,-0.013256,-0.011682,0.011715,-0.018533,-0.002068,0.003632,0.007394,-0.014823,...,-0.001307,0.00726,0.004889,-0.004398,0.010547,-0.007896,0.008926,-0.004852,-0.007934,0.008657
1,6239,0.028074,-0.011921,-0.01708,0.02546,-0.03109,-0.003898,0.00863,-0.010364,-0.001128,...,0.009855,-0.014453,-0.003728,-0.017318,-0.002309,0.013377,0.024118,0.002998,-0.016601,-0.005901
2,5859,0.062934,-0.025787,-0.028166,0.025336,-0.046992,-0.017524,0.002499,0.009294,-0.012343,...,-0.021791,-0.000702,-0.010636,0.000495,-0.003531,0.004807,-0.002697,-0.002785,0.006377,-0.002189
3,3519,0.023265,-0.006034,-0.026454,0.005153,0.003068,0.003744,-0.009146,-0.006578,-0.0069,...,-0.002776,0.002421,0.003749,0.00292,-0.004276,0.003716,0.008484,0.004947,-0.002026,-0.00891
4,5136,0.072027,-0.040623,0.02123,0.04613,-0.082095,-0.023734,0.015649,0.133416,-0.029916,...,-0.003034,0.011392,-0.007597,0.010209,0.00018,-0.009322,0.008983,0.010921,-0.012896,-0.002455


In [9]:
# Write each split's id + reduced-feature table to CSV, leaving out the
# DataFrame index.

for frame, csv_path in (
    (df_train_reduced, "../Trans data/train_features.csv"),
    (df_test_reduced, "../Trans data/test_features.csv"),
    (df_dev_reduced, "../Trans data/dev_features.csv"),
):
    frame.to_csv(csv_path, index=False)