In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the training data.
# NOTE: train_data is an alias of df, so the in-place dropna below also removes
# the incomplete rows from df itself.
train_data = df
train_data.dropna(inplace=True)

# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['text_en'], train_data['label'], test_size=0.1, random_state=42
)

# Load the pre-trained DistilBERT sentence encoder
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Encode the training and validation texts into sentence embeddings
X_train_embeddings = model.encode(X_train.tolist())
X_val_embeddings = model.encode(X_val.tolist())

# Hyperparameter grid for logistic regression.
# The default 'lbfgs' solver only supports the L2 penalty, so a flat grid over
# penalty=['l1', 'l2'] makes every L1 candidate fail to fit (scored as NaN).
# Each penalty is therefore paired with a solver that supports it; the L2
# candidates keep the default solver so their results are unchanged.
C_values = [0.01, 0.1, 1, 10]
param_grid = [
    {'penalty': ['l2'], 'C': C_values, 'solver': ['lbfgs']},
    {'penalty': ['l1'], 'C': C_values, 'solver': ['liblinear']},
]

# Initialize logistic regression classifier
classifier = LogisticRegression(max_iter=1000)

# 5-fold cross-validated grid search, selecting on F1
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='f1', verbose=1)
grid_search.fit(X_train_embeddings, y_train)

# Best candidate, refitted on all of X_train_embeddings by GridSearchCV
best_classifier = grid_search.best_estimator_

# Evaluate the classifier on the held-out validation set
val_predictions = best_classifier.predict(X_val_embeddings)
val_f1 = f1_score(y_val, val_predictions)
print("Validation F1 Score:", val_f1)

In [None]:
import joblib

# Persist the fitted classifier to disk so it can be reloaded without retraining.
joblib.dump(
    value=best_classifier,
    filename='power-distil-logistic-f1_.joblib',
)

print("Classifier saved successfully!")


In [None]:
# Score the classifier on the same embeddings it was fitted on. This is a
# training-set score (useful for spotting over/under-fitting), not a held-out
# one, so it is stored and reported separately from val_f1.
train_predictions = best_classifier.predict(X_train_embeddings)
train_f1 = f1_score(y_train, train_predictions)
print("Training F1 Score:", train_f1)

In [None]:
import joblib
# Load a previously saved classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
# NOTE(review): this path differs from the file name written by the joblib.dump
# cell ('power-distil-logistic-f1_.joblib') — presumably an earlier saved run; confirm.
loaded_classifier = joblib.load('/content/drive/MyDrive/Touche clef/Distil-logistic-f1_74.joblib')

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence encoder used to embed text before prediction; this is the same model
# name that was used to build the training embeddings.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
# Text to classify. Missing entries are replaced with a placeholder string so
# every row can be encoded. fillna() returns a new Series here instead of
# mutating a column pulled out of df in place, whose effect on df is ambiguous.
test_data = df['text_en'].fillna('not possible')

# Embed the text and predict a label for every row
X_test_embeddings = model.encode(test_data.tolist())
test_predictions = loaded_classifier.predict(X_test_embeddings)

# Save the predictions to a CSV file (one row per id)
output_df = pd.DataFrame({'id': df['id'], 'label': test_predictions})
output_df.to_csv("predictions.csv", index=False)

In [None]:
from sklearn.metrics import f1_score
# Compare the loaded classifier's predictions with the labels stored in df.
val_f1 = f1_score(y_true=df['label'], y_pred=test_predictions)
print("Validation F1 Score:", val_f1)

In [None]:
import zipfile
import os

def unzip_file(zip_path, extract_to):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Destination directory; created (including parents) if it
            does not exist yet.

    Returns:
        None. When ``zip_path`` does not exist an error message is printed and
        nothing is created or extracted.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` exists but is not a valid archive.
    """
    if not os.path.exists(zip_path):
        print(f"Error: The file {zip_path} does not exist.")
        return

    # exist_ok=True replaces a separate exists() check, so there is no window
    # in which the directory can appear between the check and the creation.
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Reported only after the archive has been closed cleanly.
    print(f"Successfully unzipped {zip_path} to {extract_to}")

# Source archive and the directory that receives its contents
zip_path = '/content/drive/MyDrive/Touche clef/ideology-power-st-testset.zip'
extract_to = '/content/cont'

unzip_file(zip_path=zip_path, extract_to=extract_to)


In [None]:
# Code that selects which test file to read and is embedded in the run file name.
# Stored as `dataset_code` rather than `all`, because a variable named `all`
# shadows the builtin all() for every later cell in the kernel.
dataset_code = "ua"
df1 = pd.read_table('/content/cont/power/power-' + dataset_code + '-test.tsv')

# Replace missing text with a placeholder so every row can be encoded.
# fillna() returns a new Series, so df1 itself is left untouched.
test_data = df1['text_en'].fillna('not possible')

# Embed the text and predict a label for every row
X_test_embeddings = model.encode(test_data.tolist())
test_predictions = loaded_classifier.predict(X_test_embeddings)

# Save the predictions as a headerless, tab-separated file (id, label)
output_df = pd.DataFrame({'id': df1['id'], 'label': test_predictions})
output_df.to_csv("pixel-power-" + dataset_code + "-run3.tsv", index=False, header=False, sep='\t')

In [None]:
# Compare the predictions for df1 with the labels stored in df1.
val_f1 = f1_score(y_true=df1['label'], y_pred=test_predictions)
print("Validation F1 Score:", val_f1)