In [23]:
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# If not previously performed:
# nltk.download('stopwords')

stemming = PorterStemmer()
stops = set(stopwords.words("english"))

def apply_cleaning_function_to_list(X):
    cleaned_X = []
    for element in X:
        cleaned_X.append(clean_text(element))
    return cleaned_X


def clean_text(raw_text):
    """This function works on a raw text string, and:
        1) changes to lower case
        2) tokenizes (breaks down into words
        3) removes punctuation and non-word text
        4) finds word stems
        5) removes stop words
        6) rejoins meaningful stem words"""
    
    #Convert to lower case Doesn't work because one column is numeric 
    #text = raw_text.lower()
    
    # Tokenize
    tokens = nltk.word_tokenize(raw_text)
    
    # Keep only words (removes punctuation + numbers)
    # use .isalnum to keep also numbers
    token_words = [w for w in tokens if w.isalpha()]
    
    # Stemming
    stemmed_words = [stemming.stem(w) for w in token_words]
    
    # Remove stop words
    meaningful_words = [w for w in stemmed_words if not w in stops]
    
    # Rejoin meaningful stemmed words
    joined_words = ( " ".join(meaningful_words))
    
    # Return cleaned data
    return joined_words


### APPLY FUNCTIONS TO EXAMPLE DATA

# Load data example
imdb = pd.read_excel("assignment.xlsx")


# Truncate data for example
imdb = imdb.head(100)

# Get text to clean
text_to_clean = list(imdb['Question'])

# Clean text
cleaned_text = apply_cleaning_function_to_list(text_to_clean)

# Show first example
print ('Original text:',text_to_clean[0])
print ('\nCleaned text:', cleaned_text[0])

# Add cleaned data back into DataFrame
imdb['cleaned_review'] = cleaned_text
imdb

Original text: is this what they intended? don't they really want $(TARGET_OUT_DATA_NATIVE_TEST)?


Cleaned text: thi intend realli want


Unnamed: 0,inline-comment-id,# Comment,Question,Final Label,cleaned_review
0,84326dd1_566c7146,1,is this what they intended? don't they really ...,request for confirmation,thi intend realli want
1,84326dd1_566c7146,2,is this what they intended? don't they really ...,surprise,thi intend realli want
2,99d1f8e4_92b31cea,3,Don't we need to increment 'i' in the else cas...,suggestion,Do need increment els case avoid infinit loop
3,193d089f_f5fac752,4,i can't see anywhere where this is set to fals...,suggestion,ca see anywher thi set fals adjust singl refer...
4,50c2f81e_ac4fd6fc,5,are you sure you want to include this source f...,criticism,sure want includ thi sourc file directli whi c...
...,...,...,...,...,...
95,64367de0_d87fa584,96,CONFIG_SND_RAWMIDI was in previous version.\nW...,request for clarification,wa previou version wa correct remov
96,a46156bd_1a1a193a,97,What would the right solution look like after ...,request for clarification,would right solut look like refactor
97,79fd56bf_7fd12f55,98,"What does this comment tell us, that the code ...",criticism,doe thi comment tell us code doe
98,b988aea5_1f8923b9,99,Which exceptions do you need to catch here? Is...,request for information,except need catch Is securityexcept We usual e...


In [2]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

for column_name in imdb.columns:
    
    if imdb[column_name].dtype == object:
        imdb[column_name] = le.fit_transform(imdb[column_name])
        
    else:
        pass
            


In [3]:
X = imdb['cleaned_review']
y = imdb['Final Label']

In [4]:
  ##################### Task 1 ###################################
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [5]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train.values.reshape(-1,1), y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
clf.predict_proba(y_test.values.reshape(-1,1))

array([[0.03190702, 0.01429061, 0.02038781, 0.01918142, 0.02405667,
        0.29234541, 0.13009821, 0.01601694, 0.06143285, 0.02855263,
        0.35445155, 0.00727888],
       [0.03130041, 0.01394426, 0.01975174, 0.0186856 , 0.02299469,
        0.29289388, 0.13158586, 0.01601862, 0.06107237, 0.02968074,
        0.35510737, 0.00696446],
       [0.03251949, 0.01464292, 0.02104055, 0.01968682, 0.02516314,
        0.29174507, 0.12860406, 0.01601237, 0.06178425, 0.02746242,
        0.3537328 , 0.00760611],
       [0.03190702, 0.01429061, 0.02038781, 0.01918142, 0.02405667,
        0.29234541, 0.13009821, 0.01601694, 0.06143285, 0.02855263,
        0.35445155, 0.00727888],
       [0.03502387, 0.01611112, 0.02382285, 0.02180448, 0.03006576,
        0.28881723, 0.12256847, 0.01596429, 0.06309239, 0.02345849,
        0.35021902, 0.00905203],
       [0.03010547, 0.01326942, 0.0185286 , 0.01772261, 0.02099807,
        0.29383674, 0.13454041, 0.0160134 , 0.06032548, 0.03205529,
        0.35623212,

## TASK 1 ACCURACY

In [7]:
print("Task 1 Accuracy")
clf.score(X_test.values.reshape(-1,1), y_test)

Task 1 Accuracy


0.4

In [8]:
################################# Task 2 #####################################
X = imdb[['Question','inline-comment-id']]
y= imdb['Final Label']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## TASK 2 ACCURACY

In [9]:
print("Task 2 Accuracy")
clf.score(X_test, y_test)

Task 2 Accuracy


0.55

In [10]:
X_train.shape


(80, 2)

In [11]:
y_train.shape

(80,)

In [12]:
############## Task 3 #######################333
X = imdb[['cleaned_review','inline-comment-id']]
y = imdb['Final Label']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## TASK 3 ACCURACY

In [13]:
print("Task 3 Accuracy")
clf.score(X_test, y_test)

Task 3 Accuracy


0.1

In [14]:
########################### Task 4 ################################
import pip
pip.main(['install','imbalanced-learn'])

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




0

In [15]:
#y_train.value_counts()

In [16]:
##########  Task 4 ######
from imblearn.over_sampling import RandomOverSampler

X = imdb[['cleaned_review','inline-comment-id']]
y = imdb['Final Label']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4)

ros = RandomOverSampler(random_state=10)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

lr2 = LogisticRegression() 
lr2.fit(X_resampled, y_resampled.ravel()) 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

## TASK 4 ACCURACY

In [17]:
print("Task 4 Accuracy")
clf.score( X_test,y_test)

Task 4 Accuracy


0.325

In [18]:
X_resampled.shape

(165, 2)

In [19]:
X_train.shape

(60, 2)

In [20]:
y_resampled.shape

(165,)

In [21]:
y_train.shape

(60,)