In [None]:
import numpy as np
import pandas as pd
import re

def import_tweets(filename, header=None):
    """Load a six-column tweet CSV and keep only sentiment and text.

    Args:
        filename: Path of the CSV file, handed to ``pandas.read_csv``.
        header: Forwarded to ``pandas.read_csv`` as its ``header`` argument.

    Returns:
        A DataFrame with the columns ``sentiment`` and ``text``, in which
        every sentiment value equal to 4 has been replaced by 1.
    """
    # The python engine together with on_bad_lines='skip' drops rows that
    # cannot be parsed instead of aborting the whole load.
    frame = pd.read_csv(
        filename,
        encoding='utf',
        header=header,
        engine='python',
        on_bad_lines='skip',
    )
    frame.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']
    # Only the label and the tweet body are used downstream.
    frame.drop(columns=['flag', 'id', 'user', 'date'], inplace=True)
    # NOTE(review): presumably the source data encodes positive as 4
    # (Sentiment140-style labels) -- confirm against the dataset.
    frame['sentiment'] = frame['sentiment'].replace(4, 1)
    return frame

def preprocess_tweet(tweet):
    """Normalize a raw tweet string before feature extraction.

    The steps run in this order:
      1. lowercase the whole text;
      2. replace links (starting with ``www.``, ``http://`` or ``https://``)
         with the token ``URL``;
      3. replace ``@`` followed by non-whitespace characters with the token
         ``AT_USER``;
      4. collapse every run of whitespace into a single space;
      5. remove the leading ``#`` from hashtags, keeping the tag text.

    Args:
        tweet: The tweet text as a ``str``.

    Returns:
        The normalized tweet text.
    """
    # Lowercasing happens first so the upper-case placeholder tokens inserted
    # below are not lowercased themselves.
    tweet = tweet.lower()
    # All patterns are raw strings: the regex escapes are not valid Python
    # string escapes and trigger a compile-time warning when written in
    # ordinary string literals.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    tweet = re.sub(r'[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet

def feature_extraction(data, method="tfidf"):
    """Turn a collection of documents into a feature matrix.

    Args:
        data: Iterable of text documents passed to the vectorizer.
        method: ``"tfidf"`` or ``"doc2vec"``.

    Returns:
        * ``"tfidf"``: the result of ``TfidfVectorizer.fit_transform(data)``
          with sublinear TF scaling and English stop words removed.
        * ``"doc2vec"``: ``None`` (this branch has no implementation).
        * anything else: the string ``"Incorrect inputs"``.
    """
    if method == "doc2vec":
        # Placeholder branch: no features are computed.
        return None
    if method != "tfidf":
        return "Incorrect inputs"

    # Imported lazily so the other branches do not need scikit-learn.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words="english")
    return vectorizer.fit_transform(data)

def train_classifier(features, label, classifier="logistic_regression"):
    """Fit a classifier and print its fit on the training data.

    The model is fitted on ``features``/``label`` and then evaluated on the
    same data, so the printed accuracy is a training accuracy, not a
    held-out estimate.

    Args:
        features: Feature matrix passed to ``model.fit``.
        label: Target labels passed to ``model.fit``.
        classifier: One of ``"logistic_regression"``, ``"naive_bayes"`` or
            ``"svm"``.

    Returns:
        None. Results are printed. For an unknown ``classifier`` a message
        is printed and the function returns without training anything.
    """
    if classifier == "logistic_regression":
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(C=1.)
    elif classifier == "naive_bayes":
        from sklearn.naive_bayes import MultinomialNB
        model = MultinomialNB()
    elif classifier == "svm":
        from sklearn.svm import SVC
        # probability=True is needed for SVC to expose predict_proba.
        model = SVC(probability=True)
    else:
        print("Incorrect selection of classifier")
        return

    # Imported only after the classifier name has been validated, so an
    # invalid name is reported without touching the metrics module.
    from sklearn.metrics import accuracy_score

    model.fit(features, label)
    # Column 1 of predict_proba is the second entry of model.classes_
    # (the positive class when labels are 0/1 -- confirm for other labels).
    probability_to_be_positive = model.predict_proba(features)[:, 1]
    predicted_labels = model.predict(features)
    accuracy = accuracy_score(label, predicted_labels)

    # Despite the wording, this prints the first five rows, not the five
    # highest probabilities.
    print("top 5 scores:", probability_to_be_positive[:5])
    print("accuracy (train data):", accuracy)

# Driver: load the tweets, normalize the text, vectorize it and train.
tweet_dataset = import_tweets(filename="/content/dummy_tweets.csv")
tweet_dataset['text'] = tweet_dataset['text'].apply(func=preprocess_tweet)

# Plain NumPy arrays of the documents and their labels.
data = np.array(tweet_dataset['text'])
label = np.array(tweet_dataset['sentiment'])

features = feature_extraction(data=data, method="tfidf")
train_classifier(features=features, label=label, classifier="logistic_regression")


top 5 scores: [0.34342466 0.27346501 0.59303341 0.2951069  0.61087559]
accuracy (train data): 0.8571428571428571
