In [10]:
import joblib
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [11]:
# Restore the persisted TF-IDF vectorizer (presumably already fitted, since
# only transform() is called on it later) and read the three preprocessed
# splits (train / test / dev) into DataFrames.
# NOTE(review): joblib.load unpickles the file, so it should only be pointed
# at artifacts that come from a trusted source.

tfidf_vectorizer = joblib.load("../No trans data/tfidf_vectorizer_2.pkl")

df_train, df_test, df_dev = (
    pd.read_csv(split_csv)
    for split_csv in (
        "../No trans data/train_processed.csv",
        "../No trans data/test_processed.csv",
        "../No trans data/dev_processed.csv",
    )
)

In [12]:
# Vectorize the "text" column of every split with the loaded vectorizer.
# Only transform() is called, so the vectorizer is applied exactly as loaded
# and is never re-fitted here.

X_train_tfidf, X_test_tfidf, X_dev_tfidf = (
    tfidf_vectorizer.transform(split_frame["text"])
    for split_frame in (df_train, df_test, df_dev)
)

In [13]:
# Dimensionality reduction with Truncated SVD: the decomposition is fitted on
# the training matrix only, and the fitted model is then applied unchanged to
# the test and dev matrices.

n_components = 1024  # number of SVD components requested
svd = TruncatedSVD(random_state=42, n_components=n_components)

X_train_reduced = svd.fit_transform(X_train_tfidf)
X_test_reduced, X_dev_reduced = map(svd.transform, (X_test_tfidf, X_dev_tfidf))

In [14]:
# Wrap each reduced matrix in a DataFrame whose feature columns are named
# f1..f{n_components}, with the matching split's "id" column placed first.

feature_names = [f"f{i+1}" for i in range(n_components)]


def _with_ids(reduced, ids):
    """Return a DataFrame of ``reduced`` with ``ids`` as the leading "id" column.

    Args:
        reduced: 2-D array with one column per entry of ``feature_names``.
        ids: Series of row identifiers, one per row of ``reduced`` and in the
            same row order.

    Returns:
        A new DataFrame with columns ``["id", *feature_names]``.

    Raises:
        ValueError: if ``ids`` and ``reduced`` do not have the same number of
            rows.
    """
    frame = pd.DataFrame(reduced, columns=feature_names)
    # Pass the raw values so ids are attached by position. Inserting the
    # Series itself aligns on index labels, which would silently produce
    # NaN/misplaced ids if the source frame's index were not 0..n-1.
    frame.insert(0, "id", ids.to_numpy())
    return frame


df_train_reduced = _with_ids(X_train_reduced, df_train["id"])
df_test_reduced = _with_ids(X_test_reduced, df_test["id"])
df_dev_reduced = _with_ids(X_dev_reduced, df_dev["id"])

In [15]:
# Preview the first five rows of the reduced training features.
df_train_reduced.head(n=5)

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023,f1024
0,3268,0.000194,-0.000448,-0.000397,-0.000612,0.00625,0.023124,-0.00223,0.000174,-0.000793,...,0.035543,-0.031828,-0.012105,0.024229,-0.004416,-0.01054,-0.000656,-0.022711,-0.00133,-0.022082
1,6239,4.4e-05,-0.000238,-0.000532,-0.000403,0.005419,0.017862,-0.001735,-0.000743,0.001989,...,-0.019042,0.029861,-0.024504,-0.021262,0.001535,-0.009784,-0.00888,-0.015414,-0.038224,-0.013358
2,5859,0.000163,-0.00043,-0.000887,-0.000756,0.010602,0.034897,-0.003434,-0.000732,0.000327,...,0.021173,0.003468,0.01647,0.016832,-0.002572,0.000277,0.004482,0.014311,-0.005429,-0.007738
3,3519,0.146635,-0.094631,-0.052358,0.239139,0.051329,-0.006273,-0.099825,-0.224923,0.037245,...,-0.002057,-0.011877,-0.015151,0.000135,-0.011488,0.005445,-0.029556,0.004443,0.033251,0.001302
4,5136,0.001387,-0.000739,-0.001948,-0.002724,0.021708,0.079995,-0.008127,0.003687,-0.014464,...,-0.007908,0.014679,0.016517,0.01721,0.01048,-0.011816,0.021297,-0.012914,-0.013742,0.009975


In [16]:
# Write each reduced split to its own CSV file; index=False keeps the
# DataFrame index out of the written file.

for reduced_frame, csv_target in (
    (df_train_reduced, "../No trans data/train_features.csv"),
    (df_test_reduced, "../No trans data/test_features.csv"),
    (df_dev_reduced, "../No trans data/dev_features.csv"),
):
    reduced_frame.to_csv(csv_target, index=False)