In [None]:
!pip install simpletransformers --quiet

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.metrics import mean_squared_error as mse
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm

from simpletransformers.classification.classification_model import ClassificationModel
from sklearn.model_selection import train_test_split
# from scipy.special import softmax




### Loading the data

In [None]:
# Read the three competition CSVs from the working directory in one pass.
train_set, test_set, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

# Preview the first rows of the test data.
test_set.head()

In [None]:
# Quick look at the training data: text-length distribution and label frequencies.
# - .str.len() gives NaN for missing text instead of raising the way len() does.
# - display() is required for the first summary because a notebook cell only
#   renders its last expression automatically.
display(train_set["safe_text"].str.len().describe())
train_set["label"].value_counts()

In [None]:
# Force every label outside {-1, 0, 1} to -1. Missing labels also fail the isin()
# test, so they are set to -1 as well.
# A single .loc assignment is used because chained indexing (df[col][mask] = value)
# may write to a temporary copy and leave train_set unchanged.
# Experiment idea from the original author: map these rows to the neutral label
# (presumably 0) instead of -1.
train_set.loc[~train_set['label'].isin([0,-1,1]), 'label'] = -1

In [None]:
# Count missing values per column of the training data.
train_set.isna().sum()

In [None]:
# Discard every training row that has a missing value in any column.
train_set = train_set.dropna(axis=0, how='any')



In [None]:
# Cleaning the test set: start by counting missing values per column.
test_set.isna().sum()

In [None]:
# Test rows cannot be dropped (each one needs a prediction), so missing text is
# replaced with a placeholder string instead.
test_set.loc[test_set["safe_text"].isna(), "safe_text"] = "xxxxxx"

# Total number of cells (rows x columns) left in the training data.
train_set.size

### Training the models






In [None]:
def create_model(model_type,model_name,epochs=2,train_batch_size=110,eval_batch_size=130,max_seq_len=134,learning_rate=2e-5):
    """Build a simpletransformers ClassificationModel configured for regression.

    The model is created with a single output (num_labels=1) and the
    'regression' option switched on. The seed is fixed at 2, and the options
    for eval checkpoints and per-epoch model saving are switched off.

    Args:
        model_type: Architecture family passed to ClassificationModel (e.g. 'roberta').
        model_name: Pretrained weights identifier passed to ClassificationModel.
        epochs: Value for 'num_train_epochs'.
        train_batch_size: Value for 'train_batch_size'.
        eval_batch_size: Value for 'eval_batch_size'.
        max_seq_len: Value for 'max_seq_length'.
        learning_rate: Value for 'learning_rate'.

    Returns:
        The freshly constructed (untrained) ClassificationModel.
    """
    training_args = {
        'train_batch_size': train_batch_size,
        'eval_batch_size': eval_batch_size,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'fp16': False,
        'do_lower_case': False,
        'num_train_epochs': epochs,
        'max_seq_length': max_seq_len,
        'regression': True,
        'manual_seed': 2,
        'learning_rate': learning_rate,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
    }
    return ClassificationModel(model_type, model_name, num_labels=1, args=training_args)


In [None]:
# Frames handed to simpletransformers. Both use the explicit 'text' / 'labels'
# column names so the training/eval frame matches the test frame and does not
# depend on the library's positional-column fallback.
temp_df = pd.DataFrame({'text': train_set["safe_text"], 'labels': train_set["label"]})

# The test set has no labels; a dummy 0 is supplied so the frame has the same
# two-column layout as the training frame.
temp_test = test_set[['safe_text']].rename({'safe_text':'text'},axis=1)
temp_test['labels'] = 0

# Hold out 30% of the labelled data for validation (fixed seed for repeatability).
temp_train,temp_eval = train_test_split(temp_df,test_size=0.3,random_state=3)

In [None]:
# Model 1: roberta-base, 3 epochs, remaining settings from create_model's defaults.
model1 = create_model('roberta','roberta-base',epochs=3)
model1.train_model(temp_train)

# Element 1 of eval_model's return value is used as the predictions
# (presumably the raw model outputs -- confirm against the simpletransformers docs).
# Predictions are clipped to the valid label range [-1, 1].
preds_val = np.clip(model1.eval_model(temp_eval)[1], -1, 1)
print(f"MSE: {mse(temp_eval['labels'],preds_val)}\n RMSE:{(mse(temp_eval['labels'],preds_val))**0.5}")

# The test frame carries dummy labels, so only the outputs of this call are used.
test_preds = np.clip(model1.eval_model(temp_test)[1], -1, 1)

# Keep this model's validation / test predictions for the blend below.
pev_1, pt_1 = preds_val, test_preds

In [None]:
# Model 2: roberta-large, 3 epochs, with smaller batches and a lower learning rate
# than model 1.
model2 = create_model('roberta','roberta-large',epochs=3,train_batch_size=16,eval_batch_size=16,learning_rate=1e-5)
model2.train_model(temp_train)

# Element 1 of eval_model's return value is used as the predictions
# (presumably the raw model outputs -- confirm against the simpletransformers docs).
# Predictions are clipped to the valid label range [-1, 1].
preds_val = np.clip(model2.eval_model(temp_eval)[1], -1, 1)
print(f"MSE: {mse(temp_eval['labels'],preds_val)}\n RMSE:{(mse(temp_eval['labels'],preds_val))**0.5}")

# The test frame carries dummy labels, so only the outputs of this call are used.
test_preds = np.clip(model2.eval_model(temp_test)[1], -1, 1)

# Keep this model's validation / test predictions for the blend below.
pev_2, pt_2 = preds_val, test_preds

In [None]:
# Weighted blend of the two models' validation predictions (30% model 1, 70% model 2),
# scored with RMSE against the held-out labels.
final_pv = 0.3 * pev_1 + 0.7 * pev_2
print(f"RMSE: {mse(temp_eval['labels'],final_pv)**0.5}")

In [None]:
# Blend the test predictions with the same 30% / 70% weights used on validation.
tp = 0.3 * pt_1 + 0.7 * pt_2

In [None]:
final_preds = tp
# Take an explicit copy: a later cell adds a column to final_df, and writing into
# a slice of test_set is a chained-assignment hazard.
# NOTE(review): "ID_COL" looks like a placeholder -- confirm it matches the
# identifier column actually present in Test.csv.
final_df = test_set[["ID_COL"]].copy()

In [None]:
# Attach the blended predictions and write the submission CSV without the index column.
submission_file_name = "NLP_challenge_solution.csv"
final_df["TARGET_COL"] = final_preds
final_df.to_csv(submission_file_name, index=False)

In [None]:
# Sanity check: preview the first five rows of the submission.
final_df.head(5)