# Twitter Data Sentiment Classification

For this exercise we will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following [link](https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2). We will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

In [None]:
#### Import necessary packages
import nltk
import re
from os.path import join
import numpy as np
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Define test sets
testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']

In [2]:
# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    readin the testset and return a dictionary
    :param testset: str, the file name of the testset to compare
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('\t')
            tweetid = fields[0]
            gt = fields[1]

            id_gts[tweetid] = gt

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    print the confusion matrix of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    gts = []
    for m, c1 in id_gts.items():
        if c1 not in gts:
            gts.append(c1)

    gts = ['positive', 'negative', 'neutral']

    conf = {}
    for c1 in gts:
        conf[c1] = {}
        for c2 in gts:
            conf[c1][c2] = 0

    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        print(c1.ljust(12), end='')
        for c2 in gts:
            if sum(conf[c1].values()) > 0:
                print('%.3f     ' % (conf[c1][c2] / float(sum(conf[c1].values()))), end='')
            else:
                print('0.000     ', end='')
        print('')

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    print the macro-F1 score of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset) #{tweet id : sentiment}

    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    catf1s = {}

    ok = 0
    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'

        if gt == pred:
            ok += 1
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    catcount = 0
    itemcount = 0
    macro = {'p': 0, 'r': 0, 'f1': 0}
    micro = {'p': 0, 'r': 0, 'f1': 0}
    semevalmacro = {'p': 0, 'r': 0, 'f1': 0}

    microtp = 0
    microfp = 0
    microtn = 0
    microfn = 0
    for cat, acc in acc_by_class.items():
        catcount += 1

        microtp += acc['tp']
        microfp += acc['fp']
        microtn += acc['tn']
        microfn += acc['fn']

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        catf1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            semevalmacro['p'] += p
            semevalmacro['r'] += r
            semevalmacro['f1'] += f1

        itemcount += n

    micro['p'] = float(microtp) / float(microtp + microfp)
    micro['r'] = float(microtp) / float(microtp + microfn)
    micro['f1'] = 2 * float(micro['p']) * micro['r'] / float(micro['p'] + micro['r'])

    semevalmacrof1 = semevalmacro['f1'] / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)

Data Preprocessing

In [None]:
def regex_process(text):
    processed = re.sub(r"(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", "", text)
    # processed = re.sub(r"((https?|ftp)://)?[a-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+\.[a-z]{2,}[a-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+","",text) #url removed
    processed = re.sub(r"&[a-zA-Z0-9]+;", "",processed) # html entity removal
    processed = re.sub(r"@\w+","",processed) #@user mentions handling
    processed = re.sub(r"(\.|!|\?)"," ",processed)
    processed = re.sub(r"[^A-Za-z0-9 ]","",processed) # removing alphanumeric characters excluding space
    processed = re.sub(r"\s+"," ",processed)   # white space removed
    processed = re.sub(r"\b[0-9]+\b","",processed)  # numbers removed

    return re.sub(r"\b[a-z0-9]\b","",processed)
def remove_stopwords(texts):
    stop_words = set(stopwords.words('english'))
    filtered = [text for text in texts.split() if text not in stop_words]
    return ' '.join(filtered)

def get_pos(tag):
    if tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN 
    
def lemmatize(text):
    texts = text.split()
    lm = nltk.stem.WordNetLemmatizer()
    pos_tags = nltk.pos_tag(texts)
    words = [lm.lemmatize(word, pos=get_pos(tag)) for word, tag in pos_tags]
    return ' '.join(words)

def preprocess(text):
    text = regex_process(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text

In [None]:
#test sample
example = '@DaniBooThang @indeliblemarq__ my cousin didnt know https://t.co/cDirIVdAEC from the Friday movies lol'
preprocess(example)

'cousin didnt know Friday movie lol'

#### Load training set, dev set and testing set
Here, we need to load the training set, the development set and the test set. For better classification results, we need to preprocess tweets before sending them to the classifiers.

In [None]:
# Load training set, dev set and testing set
data = {}
tweetids = {}
tweetgts = {}
tweets = {}
prediction_dict = {}
training_file = 'twitter-training-data.txt'
dev_file = 'twitter-dev-data.txt'

# format
# tweetids = {'training':[id1,id2]}
# tweetgts = {'training':[gts1,gts2]}
  
for dataset in [training_file] + [dev_file] + testsets:
    data[dataset] = []
    tweets[dataset] = []
    tweetids[dataset] = []
    tweetgts[dataset] = []
    location = join('semeval-tweets', dataset)
    # write code to read in the datasets here
    with open(location, encoding="utf8") as d:
        for line in d:
            data_ = line.split('\t') #each line data
            tweetid = data_[0] #retriving tweet ids
            gts = data_[1] #retriving sentiment of the tweet
            tweet = preprocess(data_[2].lower()) #retriving tweet content and preprocessing it
            #appending all the items for each of training, validation and test dataset
            data[dataset].append(line)  
            tweets[dataset].append(tweet)
            tweetids[dataset].append(tweetid)
            tweetgts[dataset].append(gts)




In [None]:
print(tweetgts[dataset][:20])
tweets[training_file][:5]

['neutral', 'negative', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'negative', 'positive']


['felt privilege play foo fighter song guitar today one plectrum gig saturday',
 'pakistan may islamic country der lot true muslim india love country sacrifice',
 'happy birthday cool golfer bali may become cooler cooler everyday stay humble little sister xx',
 'tmills go tucson 29th thursday',
 'hmmmmm blacklivesmatter matter like rise kid disgrace']

Feature Extraction using Bag of Words and TF-IDF 

Bag of Words: It represents each tweet as a vector of word frequencies. But is does not capture the context and meaning of words

Term Frequency-Inverse Document Frequency: It represents each tweet as a weighted word frequency vector. These weights reflects the significance of words in the tweet and corpus of documents. 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def feature_extraction(type, train, test, datatype):
    types = {'bow': CountVectorizer(), 'tf-idf': TfidfVectorizer()}
    vectorizer = types.get(type)
    X_train = vectorizer.fit_transform(train)
    X_test = vectorizer.transform(test)
    return X_train,X_test

Code for LSTM

Creating a vocabulary to get all the unique words in the data and creating a word index with words as keys and unique integers as values for reference

In [None]:
from collections import Counter

# vocabulary creation
def get_vocab(tweets):
    tokenized_tweets = [tweet.split() for tweet in tweets]
    word_counts = Counter([word for tweet in tokenized_tweets for word in tweet])
    vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    return vocab

# creating word index dictionary
def get_word_index(vocab):
    word_idx = {'pad':0,'<unk>':1}
    for i, word in enumerate(vocab):
        word_idx[word] = i + 2 # we add 1 to reserve 0 for padding
    return word_idx

vocab = get_vocab(tweets[training_file])
word_index = get_word_index(vocab)


Reading glove data and creating a dictionary with words as keys and vectors as values

In [None]:
from torch.utils.data import DataLoader
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Loading GloVe embeddings
glove={}
location = join('glove.6B.100d.txt')
with open(location, encoding="utf8") as d:
    for line in d:
        data_ = line.split()
        word = str(data_[0])
        vector = np.asarray(data_[1:], dtype='float32') 
        glove[word] = vector


Creating an embedding matrix by mapping every word in word_index dictionary to corresponding array in glove vector

In [None]:
max_words = 5000
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim)) 
for token, i in word_index.items():
    if i == 0:
        continue
    if i<max_words:
        if token in glove:
            embedding_matrix[i] = glove[token]
        else:
            embedding_matrix[i] = np.random.randn(100) #this
    else:
        break
embedding_matrix.shape

(5000, 100)

Dataset class to contain the datasets, tokenise tweets and convert it into vector of unique values by mapping words to word_index and labels to (0,1,2)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F
from tqdm import tqdm
import torch.utils.data as Data

max_words = 5000
batch_size = 32
maxlen = 30 # maximum length of each sequence
idx_labels = {2:'positive', 0:'negative', 1:'neutral'}
class MyDataset(Data.Dataset):

    def __init__(self, features, classes):
        self.features = features
        self.classes = classes
        self.class_mapping = {'positive':0, 'negative': 1, 'neutral':2}

    def __len__(self):
        return self.features.__len__()
    
    def __getitem__(self, idx):
        tokens = nltk.word_tokenize(self.features[idx])
        tweet_indices = [word_index[word] if word in word_index and word_index[word]<5000 else word_index['<unk>'] for word in tokens][:maxlen]
        tweet_indices += [0] * (maxlen - len(tweet_indices)) # pad with 0's if sequence is shorter than maxlen
        labels = [self.class_mapping[i] for i in self.classes]
        tweet_indices = torch.tensor(tweet_indices, dtype = torch.long)
        st_classes = torch.tensor(labels[idx], dtype = torch.long)
        return tweet_indices, st_classes
       

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    # -- Disable the gradient --
    with torch.no_grad():
        Y_shuffled, Y_preds, losses = [],[],[]
        for X, Y in val_loader:
            # optimizer.zero_grad()

            preds = model(X)
            loss  = loss_fn(preds, Y)
            losses.append(loss.item())
            # loss.backward()
            # optimizer.step()
            Y_shuffled.append(Y)
            Y_preds.append(preds.argmax(dim=-1))

        Y_shuffled = torch.cat(Y_shuffled)
        Y_preds    = torch.cat(Y_preds)

        print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
        print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().numpy(), Y_preds.detach().numpy())))

LSTM Classifier Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F
from tqdm import tqdm

# Defining the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, max_words, embedding_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(max_words, embedding_dim) #5000*100
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix)) #using pretrained embedding vector
        self.embedding.weight.requires_grad = False  # freeze the embedding layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,) #100*128
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(hidden_dim, num_classes) #128*3

    def forward(self, x):
        embeds = self.embedding(x) #(20, 40, 100)
        lstm_out, _ = self.lstm(embeds.float()) #[20, 128] for lstm_out[:, -1, :]
        out = self.fc(lstm_out[:, -1, :])# [20, 1]
        return (out)



In [None]:
# from sklearn.model_selection import GridSearchCV

# Set hyperparameters
MAX_WORDS = 5000
EMBEDDING_DIM = 100
HIDDEN_SIZE = 256
NUM_CLASSES = 3
LEARNING_RATE = 1e-3
NUM_EPOCHS =10
WEIGHT_DECAY = 0.001

In [None]:

# Instantiate the model
model = LSTMClassifier(embedding_matrix, HIDDEN_SIZE, MAX_WORDS, EMBEDDING_DIM, NUM_CLASSES).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam( model.parameters(), lr=LEARNING_RATE)

In [None]:
def get_dataset(X, y, batch_size):
    
    def collate_fn(batch):
        tweets, classes = zip(*batch)
        tweets = nn.utils.rnn.pad_sequence(tweets, batch_first=True)
        classes = torch.tensor(classes, dtype=torch.long)
        return tweets, classes

    train_data = MyDataset(X,y)
    return DataLoader(train_data,batch_size,collate_fn=collate_fn)
    

In [None]:
batch_size = 32
def collate_fn(batch):
    tweets, classes = zip(*batch)
    tweets = nn.utils.rnn.pad_sequence(tweets, batch_first=True)
    classes = torch.tensor(classes, dtype=torch.long)
    return tweets, classes

train_data = MyDataset(tweets[training_file],tweetgts[training_file])
dev_data = MyDataset(tweets[dev_file],tweetgts[dev_file])
train_loader = DataLoader(train_data,batch_size,collate_fn=collate_fn)
dev_loader = DataLoader(dev_data,batch_size,collate_fn=collate_fn)

Defining Training Loop. Here, scheduler was used to gradually decrease the learning rate and find the best one out of them and after finding it out, it was commented out and the learning rate obtained(1e-3) is kept as a constant for our model

In [None]:

def training_loop(model, loss_fn, optimizer, train_loader, val_loader, epochs):
    for epoch in range(epochs):
        running_loss = []
        model.train()
        for i, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            # print(outputs,labels)
            loss = loss_fn(outputs, labels).to(device)
            running_loss.append(loss.item())
            loss.backward()
            optimizer.step()
        # scheduler.step()
        print("Epoch ",epoch," - Train Loss : {:.3f}".format(torch.tensor(running_loss).mean()))
        # print("Scheduler learning rate {:.3f}".format(scheduler.get_last_lr()[0])) 
        # scheduler was used to gradually decrease the learning rate and find the best one out of them
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

Defining Prediction Code for LSTM

In [None]:
# Evaluation on the Test Set 
import gc
def predict(model, loader):
    Y_preds = []
    for X, Y in loader:
        preds = model(X)
        Y_preds.append(preds)
    gc.collect()
    Y_preds = torch.cat(Y_preds)
    return F.softmax(Y_preds, dim=-1).argmax(dim=-1).detach().numpy() # logits to prob distribution 



In [None]:
# training_loop(model, criterion, optimizer, train_loader, dev_loader, 1)

100%|██████████| 1410/1410 [06:13<00:00,  3.77it/s]


Epoch  0  - Train Loss : 0.371
Valid Loss : 1.140
Valid Acc  : 0.624


#### Build sentiment classifiers
We will be creating three different classifiers for this project. For each classifier, we choose between the bag-of-word features and the word-embedding-based features. Each classifier we will be evaluating over validation and test sets. 

In [None]:
# Buiding traditional sentiment classifiers  
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from torch.optim.lr_scheduler import StepLR

params = {'svm':{'bow':0.1,'tf-idf':1},'naive-bayes':{'bow':1,'tf-idf':0.1}}

for classifier in ['naive-bayes','svm', 'LSTM']:
    for features in ['bow', 'tf-idf', 'embedding']:
        # Skeleton: Creation and training of the classifiers
        if classifier == 'svm':
            if features == 'embedding':
                continue
            print('Training ' + classifier)
            svm_classifier = LinearSVC(C=params.get(classifier).get(features))
        elif classifier == 'naive-bayes' and features != 'embedding':
            if features == 'embedding':
                continue
            print('Training ' + classifier)
            nb = MultinomialNB(alpha=params.get(classifier).get(features))
        elif classifier == 'LSTM' :
            if features == 'bow' or features == 'tf-idf':
                continue
            # write the LSTM classifier here
            print('Training ' + classifier)
            # Instantiate the model
            model = LSTMClassifier(embedding_matrix, HIDDEN_SIZE, MAX_WORDS, EMBEDDING_DIM, NUM_CLASSES).to(device)
            # Loss function and optimizer
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam( model.parameters(), lr=LEARNING_RATE)
            # scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
            train_loader = get_dataset(tweets[training_file],tweetgts[training_file],batch_size)
            data_loader = get_dataset(tweets[testset],tweetgts[testset],batch_size)
            training_loop(model, criterion, optimizer, train_loader, data_loader, NUM_EPOCHS)
            
        else:
            print('Unknown classifier name' + classifier)
            continue
        dev = ['twitter-dev-data.txt']
        # Predition performance of the classifiers
        for testset in testsets:
            id_preds = {}
            # write the prediction and evaluation code here
            if features != 'embedding':
                X_train, X_test = feature_extraction(features,tweets[training_file],tweets[testset],'test')
            if classifier == 'svm' and features != 'embedding':
                svm_classifier.fit(X_train, tweetgts[training_file])
                y_pred = svm_classifier.predict(X_test)
            elif classifier == 'naive-bayes' and features != 'embedding':
                nb.fit(X_train, tweetgts[training_file])
                y_pred = nb.predict(X_test)
            elif classifier == 'LSTM':
                if features == 'bow' or 'tf-idf':
                    continue
                else:
                    data_loader = get_dataset(tweets[testset],tweetgts[testset],batch_size)
                    y = predict(model, data_loader)
                    y_pred = [idx_labels[i] for i in y]
            for k,v in zip(tweetids[testset],y_pred):
                id_preds[k] = v
            
            testset_name = testset
            testset_path = join('semeval-tweets', testset_name)
            evaluate(id_preds, testset_path, features + '-' + classifier)
            confusion(id_preds, testset_path, features + '-' + classifier)

Training naive-bayes
semeval-tweets\twitter-dev-data.txt (bow-naive-bayes): 0.595
            positive  negative  neutral
positive    0.609     0.080     0.311     
negative    0.058     0.606     0.335     
neutral     0.227     0.141     0.632     

Training naive-bayes
semeval-tweets\twitter-dev-data.txt (tf-idf-naive-bayes): 0.542
            positive  negative  neutral
positive    0.610     0.081     0.309     
negative    0.063     0.656     0.281     
neutral     0.249     0.162     0.589     

Unknown classifier namenaive-bayes
Training svm
semeval-tweets\twitter-dev-data.txt (bow-svm): 0.596
            positive  negative  neutral
positive    0.677     0.054     0.269     
negative    0.073     0.629     0.297     
neutral     0.228     0.153     0.619     

Training svm
semeval-tweets\twitter-dev-data.txt (tf-idf-svm): 0.609
            positive  negative  neutral
positive    0.663     0.051     0.287     
negative    0.051     0.629     0.321     
neutral     0.239     0.144

100%|██████████| 1410/1410 [05:40<00:00,  4.14it/s]


Epoch  0  - Train Loss : 1.007
Valid Loss : 0.939
Valid Acc  : 0.519


100%|██████████| 1410/1410 [05:34<00:00,  4.22it/s]


Epoch  1  - Train Loss : 0.844
Valid Loss : 0.801
Valid Acc  : 0.627


100%|██████████| 1410/1410 [05:37<00:00,  4.18it/s]


Epoch  2  - Train Loss : 0.782
Valid Loss : 0.770
Valid Acc  : 0.653


100%|██████████| 1410/1410 [06:06<00:00,  3.85it/s]


Epoch  3  - Train Loss : 0.744
Valid Loss : 0.760
Valid Acc  : 0.659


100%|██████████| 1410/1410 [05:38<00:00,  4.17it/s]


Epoch  4  - Train Loss : 0.701
Valid Loss : 0.767
Valid Acc  : 0.659


100%|██████████| 1410/1410 [05:55<00:00,  3.97it/s]


Epoch  5  - Train Loss : 0.650
Valid Loss : 0.800
Valid Acc  : 0.641


100%|██████████| 1410/1410 [07:18<00:00,  3.22it/s]


Epoch  6  - Train Loss : 0.590
Valid Loss : 0.842
Valid Acc  : 0.647


100%|██████████| 1410/1410 [05:51<00:00,  4.01it/s]


Epoch  7  - Train Loss : 0.525
Valid Loss : 0.915
Valid Acc  : 0.627


100%|██████████| 1410/1410 [05:32<00:00,  4.24it/s]


Epoch  8  - Train Loss : 0.464
Valid Loss : 0.982
Valid Acc  : 0.620


100%|██████████| 1410/1410 [05:32<00:00,  4.24it/s]


Epoch  9  - Train Loss : 0.415
Valid Loss : 1.051
Valid Acc  : 0.618


In [None]:
data_loader = get_dataset(tweets[testset],tweetgts[testset],batch_size)
y = predict(model, data_loader)
y_pred = [idx_labels[i] for i in y]

semeval-tweets\twitter-dev-data.txt (tf-idf-LSTM): 0.000


Sample Code for Optimization

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVC
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.preprocessing import LabelEncoder

# # Tested hyper parameters - 'C': [0.1, 1, 10, 100, 1000], 
# # 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf', 'sigmoid','poly']}

# pipeline = Pipeline([
#     # ('tfidf', TfidfVectorizer()),
#     ('clf', SVC(kernel = 'poly'))
# ])

# param_grid = {
#     'clf__C': [0.1, 1, 10, 100, 1000], 
# 'clf__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'clf__kernel': ['rbf', 'sigmoid','poly']
#     # 'tfidf__max_df': [0.5, 0.75, 1.0],
#     # 'tfidf__max_df': [0.5, 0.75, 1.0],
#     # 'tfidf__ngram_range': [(1, 1), (1, 2)],
#     # 'nb__alpha': [0.001,0.0001,0.1, 1.0, 10.0],
#     # 'clf__C': [0.1, 1, 10, 100, 1000], 
# }
# scoring = 'f1_macro'
# # X_train,Xtest = feature_extraction('bow',tweets[training_file],tweets[testsets[0]],'train')
# label_encoder = LabelEncoder()
# y_train = label_encoder.fit_transform(tweetgts[training_file])
# # Initialize the GridSearchCV object
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring=scoring, cv=5)
# grid_search.fit(X_train, y_train)

# print("Best hyperparameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)