<a href="https://colab.research.google.com/github/Longtian0608/Sentiment-analysis-project/blob/main/Code_for_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Requirements:

In [None]:
pip install numpy pandas nltk

Imports:

In [None]:
import os
import string
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

In [None]:
# Mount Google Drive at /content/gdrive; the dataset cell below reads from this mount.
# google.colab is imported here, so this cell presumably only runs inside Colab.
from google.colab import drive
drive.mount('/content/gdrive')

# Functions for tokenization

In [None]:
# Scratch check: string.punctuation is a str, not a list, so this evaluates to False.
isinstance(string.punctuation, list) 

In [None]:
from nltk.stem.porter import PorterStemmer

# Matches, inside one whitespace-delimited word: @mentions, http(s) URLs, and
# any run of characters other than ASCII letters, digits or apostrophes.
# (The original pattern also had an `http?:\S` alternative; it was a typo-level
# duplicate of the URL branch and has been dropped.)
_TOKEN_NOISE_RE = re.compile(r"@\S+|https?:\S+|[^A-Za-z0-9']+")


def tokenize(line, dix=None, remove_stopwords=False, stem=False):
    """Split a line of text into cleaned, lower-cased word tokens.

    Processing order:
      1. Split on whitespace, strip mentions/URLs/punctuation from each word
         and lower-case it. Words that become empty are dropped.
      2. If ``dix`` is given, replace each token found in it by the
         whitespace-separated words of its value (e.g. "i'll" -> "i", "will").
      3. If ``remove_stopwords`` is true, drop NLTK English stopwords.
      4. If ``stem`` is true, reduce each token with NLTK's PorterStemmer.

    Args:
        line: Text to tokenize. ``None`` and float NaN (e.g. empty spreadsheet
            cells) yield an empty list; other non-string values are converted
            with ``str()``.
        dix: Optional mapping from a lower-cased token to its expansion string.
        remove_stopwords: Whether to filter out English stopwords.
        stem: Whether to apply Porter stemming.

    Returns:
        list[str]: The processed tokens, in order of appearance.
    """
    # Missing cells arrive as None or NaN (NaN is the only float != itself).
    if line is None or (isinstance(line, float) and line != line):
        return []

    cleaned = (_TOKEN_NOISE_RE.sub('', word).lower() for word in str(line).split())
    # Drop words that were nothing but punctuation, a mention or a URL;
    # otherwise '' would leak into the vocabulary.
    tokens = [word for word in cleaned if word]

    if dix:
        expanded = []
        for word in tokens:
            if word in dix:
                # One token may expand to several words.
                expanded.extend(dix[word].split())
            else:
                expanded.append(word)
        tokens = expanded

    if remove_stopwords:
        # A set gives O(1) membership tests instead of scanning a list per token.
        stoppers = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stoppers]

    if stem:
        ps = PorterStemmer()
        tokens = [ps.stem(word) for word in tokens]

    return tokens

In [None]:
# Smoke test: expand the contraction through `dix` and drop English stopwords,
# without stemming. The trailing expression displays the resulting tokens.
line = "This is an argument with punctuations! and stopwords et stuff; But I'll get rid of these shits soon!! Will it work though?!"
dix = {"i'll": "i will"}
tokenize(line, dix=dix, remove_stopwords=True, stem=False)

In [None]:
def vocab_to_dict(vocab, slangdix=None):
    """Assign integer indices to vocabulary words, merging slang with its origin.

    Args:
        vocab: Iterable of words; repeated words are ignored after their
            first occurrence.
        slangdix: Optional mapping from a word to its canonical ("origin")
            form. A word and its origin always share the same index.

    Returns:
        tuple(dict, dict): ``(dix1, dix2)`` where ``dix1`` maps every seen word
        (and every origin that was introduced through a slang word) to its
        index, and ``dix2`` maps each index back to the canonical word.
        Indices are consecutive integers starting at 0, in first-seen order.
    """
    canonical_of = {} if slangdix is None else slangdix
    word_to_index = {}
    index_to_word = {}

    for token in vocab:
        if token in word_to_index:
            continue  # already indexed, directly or as some slang's origin

        base = canonical_of[token] if token in canonical_of else token

        if base in word_to_index:
            # The canonical form already has an index: reuse it for the alias.
            word_to_index[token] = word_to_index[base]
            continue

        # One entry is added to index_to_word per new index, so its size is
        # always the next free index.
        fresh = len(index_to_word)
        word_to_index[token] = fresh
        word_to_index[base] = fresh
        index_to_word[fresh] = base

    return word_to_index, index_to_word

In [None]:
# Smoke test: the upper-case keys in `slang` are aliases, so each should end up
# with the same index as its lower-case origin.
vocab = ['a', 'B', 'c', 'A', 'a', 'b', 'd']
slang = {'A': 'a', 'B': 'b', 'C': 'c'}
dix1, dix2 = vocab_to_dict(vocab, slangdix=slang)
print(dix1, dix2, sep='\n')

## One-hot encoding


In [None]:
def onehot(words, dix1, length=-1):
    """One-hot encode a sequence of words as a 2-D float matrix.

    Each row is the vector for one word: all zeros except a 1 in the column
    given by ``dix1[word]``. For example, with ``dix1['a'] == 0`` and six
    entries in ``dix1``, the row for ``'a'`` is ``[1, 0, 0, 0, 0, 0]``.

    Args:
        words: Iterable of words to encode.
        dix1: Mapping from word to column index (as built by ``vocab_to_dict``).
        length: Number of rows in the result. When negative (the default) or
            ``None``, one row per word is produced. Otherwise the output has
            exactly ``length`` rows: shorter inputs are padded with all-zero
            rows, and longer inputs are truncated instead of raising
            ``IndexError``.

    Returns:
        numpy.ndarray: Array of shape ``(rows, len(dix1))``. The column count
        is the number of keys in ``dix1``, so it can exceed the number of
        distinct indices when several keys share one index.

    Raises:
        KeyError: If an encoded word is not a key of ``dix1``.
    """
    words = list(words)
    n_rows = len(words) if length is None or length < 0 else length
    matrix = np.zeros((n_rows, len(dix1)))

    # Slice so that a fixed `length` shorter than the input truncates cleanly.
    columns = [dix1[word] for word in words[:n_rows]]
    if columns:  # an empty index list is not valid for integer array indexing
        matrix[np.arange(len(columns)), columns] = 1
    return matrix

In [None]:
def onehot_to_words(words, dix2):
    """Decode one-hot rows back into words (e.g. ``[1, 0, 0, 0]`` -> ``'a'``).

    Args:
        words: Iterable of 1-D one-hot vectors, typically the rows of a
            matrix produced by ``onehot``.
        dix2: Mapping from column index to word.

    Returns:
        list: The decoded word for every row that has a hot entry. All-zero
        rows (the padding ``onehot`` adds when ``length`` exceeds the number
        of words) are skipped rather than raising.

    Raises:
        ValueError: If a row contains more than one entry equal to 1.
        KeyError: If a hot column index is not a key of ``dix2``.
    """
    out = []
    for row in words:
        hot = np.flatnonzero(np.asarray(row) == 1)
        if hot.size == 0:
            continue  # padding row: nothing to decode
        if hot.size > 1:
            raise ValueError(
                "expected a one-hot row, found %d entries equal to 1" % hot.size
            )
        out.append(dix2[int(hot[0])])
    return out

In [None]:
# Encode four sample words with the mappings from the vocab_to_dict smoke test;
# the trailing expression displays the resulting matrix.
wordw = ['a', 'b', 'A', 'c']
mat = onehot(wordw, dix1, length=4)
mat

In [None]:
# Round-trip check: decode the demo matrix back to words via dix2.
onehot_to_words(mat, dix2)

# Word-to-vector pipeline


In [None]:
# Load the practice spreadsheet from the mounted Google Drive.
# Later cells read its 'text' column.
data = pd.read_excel('/content/gdrive/MyDrive/slgCsv/practice.xlsx')

In [None]:
# Tokenize every text with tokenize()'s defaults: no contraction dictionary,
# no stopword removal, no stemming.
data['tokens'] = data['text'].apply(tokenize)

In [None]:
# Tokenize again with stopword removal and stemming; the vocabulary and one-hot
# cells below are built from this column. Series.apply forwards the keyword
# arguments to tokenize, so no row-wise DataFrame.apply is needed.
data['stem'] = data['text'].apply(tokenize, remove_stopwords=True, stem=True)

In [None]:
# Collect each stemmed sentence as a list, then the distinct stems in
# first-seen order. dict.fromkeys de-duplicates with O(1) lookups while keeping
# insertion order, replacing the quadratic `word not in vocab` list scan (and
# the debug print of every repeated word).
sentences = [list(row) for row in data['stem']]
vocab = list(dict.fromkeys(word for row in sentences for word in row))

In [None]:
# Build the word->index (dix1) and index->word (dix2) maps for the corpus
# vocabulary, without a slang dictionary.
# NOTE: this rebinds the dix1/dix2 names created by the earlier smoke-test cell.
dix1, dix2 = vocab_to_dict(vocab)

In [None]:
# Number of entries in dix1; onehot() uses this as the width of every matrix.
len(dix1)

In [None]:
# Encode each row's stems as its own matrix of shape (len(stems), len(dix1)).
# length is left at its default, so row counts vary with sentence length.
data['onehot'] = data.apply(lambda x: onehot(x['stem'], dix1), axis=1)

In [None]:
# Sanity check: shape of the matrix stored under index label 0
# (assumes the frame has a label 0, e.g. the default index - TODO confirm).
data['onehot'][0].shape