### Load data

In [None]:
# We no longer need this download step: the module has been modified (and is defined inline below) so that it also provides the processed test set.
# module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
# module_name = module_url.split('/')[-1]
# print(f'Fetching {module_url}')
# #with open("file_1.txt") as f1, open("file_2.txt") as f2
# with request.urlopen(module_url) as f, open(module_name,'w') as outf:
#   a = f.read()
#   outf.write(a.decode('utf-8'))
import os
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import MultiLabelBinarizer

class DontPatronizeMe:
    """Loader for the "Don't Patronize Me!" tab-separated data files.

    ``train_path`` is the directory holding ``dontpatronizeme_pcl.tsv`` and
    ``dontpatronizeme_categories.tsv``; ``test_path`` is the path of the test
    TSV file itself. Each ``load_*`` method stores its result on the instance
    (``train_task1_df``, ``train_task2_df``, ``test_set_df``) and also returns
    it.
    """

    # Number of leading lines skipped in the two training files before data
    # rows are read (the original code sliced ``readlines()[4:]``).
    _HEADER_LINES = 4

    # Category name -> position in the binarized task 2 label vector.
    _TAG2ID = {
        'Unbalanced_power_relations': 0,
        'Shallow_solution': 1,
        'Presupposition': 2,
        'Authority_voice': 3,
        'Metaphors': 4,
        'Compassion': 5,
        'The_poorer_the_merrier': 6,
    }

    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path
        self.train_task1_df = None
        self.train_task2_df = None
        self.test_set_df = None

    @staticmethod
    def _iter_fields(path, skip=0):
        """Yield the tab-separated fields of every non-blank line of ``path``.

        The first ``skip`` physical lines are ignored. Lines are stripped of
        surrounding whitespace before splitting; lines that are empty after
        stripping (e.g. a trailing blank line) are skipped instead of
        producing a bogus one-field row.
        """
        # NOTE(review): UTF-8 is assumed for the dataset files; being explicit
        # avoids depending on the platform's default encoding.
        with open(path, encoding='utf-8') as f:
            for line_no, line in enumerate(f):
                if line_no < skip:
                    continue
                stripped = line.strip()
                if stripped:
                    yield stripped.split('\t')

    def load_task1(self):
        """
        Load task 1 training set and convert the tags into binary labels.

        Paragraphs with original labels of 0 or 1 are considered to be negative
        examples of PCL and will have the label 0 = negative.
        Paragraphs with original labels of 2, 3 or 4 are considered to be
        positive examples of PCL and will have the label 1 = positive.

        The resulting dataframe (columns ``par_id``, ``art_id``, ``keyword``,
        ``country``, ``text``, ``label``, ``orig_label``) is stored in
        ``self.train_task1_df`` and returned.
        """
        path = os.path.join(self.train_path, 'dontpatronizeme_pcl.tsv')
        rows = []
        for fields in self._iter_fields(path, skip=self._HEADER_LINES):
            par_id, art_id, keyword, country, text = fields[:5]
            # The original label is always the last field of the line.
            orig_label = fields[-1]
            rows.append({
                'par_id': par_id,
                'art_id': art_id,
                'keyword': keyword,
                'country': country,
                'text': text,
                'label': 0 if orig_label in ('0', '1') else 1,
                'orig_label': orig_label,
            })
        df = pd.DataFrame(
            rows,
            columns=['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label'],
        )
        self.train_task1_df = df
        return df

    def load_task2(self, return_one_hot=True):
        """
        Load task 2 training set, grouping category labels per paragraph.

        Every line of the categories file carries one category for one text
        span; lines that share (par_id, art_id, text, keyword, country) are
        merged into a single row whose label is the list of distinct category
        ids, in order of first appearance.

        With ``return_one_hot=True`` the label of each row is instead a vector
        with seven positions, one per category in the label map, set to 1 when
        the category is present in the paragraph and 0 otherwise. The classes
        are passed explicitly so the vector keeps seven positions even when a
        category never occurs in the file.

        The resulting dataframe (columns ``par_id``, ``art_id``, ``text``,
        ``keyword``, ``country``, ``label``) is stored in
        ``self.train_task2_df`` and returned.

        Raises ``KeyError`` if a line names a category missing from the map.
        """
        print('Map of label to numerical label:')
        print(self._TAG2ID)

        path = os.path.join(self.train_path, 'dontpatronizeme_categories.tsv')
        grouped = defaultdict(list)
        for fields in self._iter_fields(path, skip=self._HEADER_LINES):
            par_id, art_id, text, keyword, country = fields[:5]
            # The category name is the second-to-last field; the last one is
            # the annotator count, which is not used here.
            label_id = self._TAG2ID[fields[-2]]
            seen = grouped[(par_id, art_id, text, keyword, country)]
            if label_id not in seen:
                seen.append(label_id)

        labels = list(grouped.values())
        if return_one_hot:
            all_ids = sorted(self._TAG2ID.values())
            labels = MultiLabelBinarizer(classes=all_ids).fit_transform(labels)

        df = pd.DataFrame(
            [key + (label,) for key, label in zip(grouped, labels)],
            columns=['par_id', 'art_id', 'text', 'keyword', 'country', 'label'],
        )
        self.train_task2_df = df
        return df

    def load_test(self):
        """
        Load the test set from ``self.test_path``.

        Each non-blank line is split into ``par_id``, ``art_id``, ``keyword``,
        ``country`` and ``text``; ``art_id`` and ``country`` are then dropped
        and ``keyword`` is renamed to ``community``. No header lines are
        skipped. The dataframe is stored in ``self.test_set_df`` and returned.
        """
        rows = list(self._iter_fields(self.test_path))
        df = pd.DataFrame(rows, columns=['par_id', 'art_id', 'keyword', 'country', 'text'])
        df = df.drop(['art_id', 'country'], axis=1)
        df = df.rename(columns={'keyword': 'community'})
        self.test_set_df = df
        return df
  


# helper function to save predictions to an output file
def labels2file(p, outf_path):
    """Write predictions to ``outf_path``, one comma-separated line per item.

    ``p`` is an iterable of iterables; every inner value is converted with
    ``str`` and the values of one item are joined by commas. The file is
    created or truncated, and each item is followed by a newline.
    """
    with open(outf_path, 'w') as outf:
        outf.writelines(','.join(map(str, pi)) + '\n' for pi in p)
   

dpm = DontPatronizeMe(r'', r'task4_test.tsv') # train and test paths
dpm.load_task1() # load task 1 training set
dpm.load_test() # load test set


allData = dpm.train_task1_df # load training data

########################################################

# The official split files list which paragraph ids belong to train and dev.
trainIDs = pd.read_csv(r'train_semeval_parids-labels.csv')
devIDS = pd.read_csv(r'dev_semeval_parids-labels.csv')
# par_id is a string in allData, so the ids must be strings to match.
trainIDs.par_id = trainIDs.par_id.astype(str)
devIDS.par_id = devIDS.par_id.astype(str)


def _build_split(all_data, split_ids):
    """Return the rows of ``all_data`` selected by ``split_ids.par_id``.

    The result has columns ``par_id``, ``community`` (the ``keyword`` of the
    source row), ``text`` and the binary ``label``, in the order of
    ``split_ids``. When a par_id occurs more than once in ``all_data`` the
    first occurrence is used. A par_id missing from ``all_data`` raises
    ``KeyError``.
    """
    # Index once by par_id instead of scanning the whole frame three times
    # for every id in the split.
    by_par_id = all_data.drop_duplicates(subset='par_id', keep='first').set_index('par_id')
    wanted = split_ids.par_id.tolist()
    selected = by_par_id.loc[wanted, ['keyword', 'text', 'label']]
    return pd.DataFrame({
        'par_id': wanted,
        'community': selected['keyword'].to_numpy(),
        'text': selected['text'].to_numpy(),
        'label': selected['label'].to_numpy(),
    })


trainData = _build_split(allData, trainIDs)
devData = _build_split(allData, devIDS)
testData = dpm.test_set_df
# Last expression of the cell, so the notebook displays the first test rows.
testData.head()

### Logistic regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


def extract_features(field,training_data,testing_data,type="tfidf"):
    """Vectorise one text column of the training and testing data.

    The vectorizer is fitted on ``training_data[field]`` only and then applied
    to ``testing_data[field]``, so the testing data never influences the
    vocabulary or the document-frequency statistics.

    Parameters:
        field: name of the column holding the text.
        training_data: data indexed by ``field`` used to fit the vectorizer.
        testing_data: data indexed by ``field`` transformed with the fitted
            vectorizer.
        type: selects the representation by substring match:
            contains "binary" -> ``CountVectorizer(binary=True)``;
            otherwise contains "counts" -> ``CountVectorizer(binary=False)``;
            anything else (default "tfidf") -> ``TfidfVectorizer``.
            All three use ``max_df=0.7``.

    Returns:
        ``(train_feature_set, test_feature_set, vectorizer)`` where the
        vectorizer is the fitted instance.
    """
    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.7)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.7)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.7)

    # fit_transform already returns the training matrix; there is no need to
    # fit, discard the result and transform the same data a second time.
    train_feature_set = vectorizer.fit_transform(training_data[field])
    test_feature_set = vectorizer.transform(testing_data[field])

    return train_feature_set, test_feature_set, vectorizer


# Train a logistic-regression baseline on the training split and score it on
# the dev split.
# Targets are the binary labels of the train and dev frames.
Y_train=trainData['label'].values
Y_test=devData['label'].values
# No `type` is passed, so extract_features uses its default ("tfidf") branch;
# the vectorizer is fitted on the training text only.
X_train,X_test,feature_transformer=extract_features('text',trainData,devData)

# verbose=1 is what produces the "[LibLinear]" prefix in the cell output.
scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=400)
model=scikit_log_reg.fit(X_train,Y_train)

predictions=model.predict(X_test)

# F1 and recall for the positive class (label 1) on the dev split.
print(f1_score(Y_test, predictions,pos_label=1, average='binary'))
print(recall_score(Y_test, predictions, average='binary'))




[LibLinear]0.2122448979591837
0.1306532663316583
