**1. 드라이브 마운트**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**2. 데이터 불러오기**

In [None]:
import pandas as pd
import csv
imdb = pd.read_csv('/content/drive/MyDrive/Data/IMDBDataset.csv', error_bad_lines=False, engine='python')
imdb = imdb[:10000]

In [None]:
imdb.head()
imdb.info()
imdb.isnull().any()
imdb.count()
imdb['sentiment'].value_counts()

**3. 데이터 전처리**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('all')
nltk.download('popular')

In [6]:
def preprocessing(text):
    # 단어 토큰화
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 
   
    # 불용어 제거
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    
    # 3 단어 이내 단어 제거
    tokens = [word for word in tokens if len(word) >= 3]
       
    # 소문자화
    tokens = [word.lower() for word in tokens]
    
    # 표제어 추출
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    preprocessed_text= ' '.join(tokens)
    return preprocessed_text

In [7]:
imdb['review_pre'] = imdb['review'].apply(lambda x: preprocessing(x))

In [None]:
imdb.head()

**4-1) train & test 데이터 분리**

In [9]:
import numpy as np

In [10]:
trainset_size = int(round((len(imdb)*0.7)))
trainset_size
x_train = np.array([el for el in imdb.iloc[:trainset_size,2]])
y_train = np.array([el for el in imdb.iloc[:trainset_size,0]])
x_test = np.array([el for el in imdb.iloc[trainset_size:,2]])
y_test = np.array([el for el in imdb.iloc[trainset_size:,0]])

**4-2) TF-IDF 벡터 생성**

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), \
		stop_words='english', strip_accents='unicode', norm='l2')
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

In [13]:
X_train

<7000x84869 sparse matrix of type '<class 'numpy.float64'>'
	with 805610 stored elements in Compressed Sparse Row format>

**5-1) 분류기 생성I (NaiveBayesian)**

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
clf_NB = MultinomialNB().fit(X_train, y_train)
y_predicted_NB = clf_NB.predict(X_test)

In [None]:
print (' \n confusion_matrix (Naive Bayesian) \n ')
cm = confusion_matrix(y_test, y_predicted_NB)
print (cm)

print ('\n Classification Report:')
print (classification_report(y_test, y_predicted_NB))

**5-2) 분류기 생성II (DecisionTree)**

In [None]:
from sklearn import tree

In [None]:
clf_DT = tree.DecisionTreeClassifier().fit(X_train, y_train)
y_predicted_DT = clf_DT.predict(X_test)

In [None]:
print (' \n confusion_matrix (Decision Tree) \n ')
cm = confusion_matrix(y_test, y_predicted_DT)
print (cm)

print ('\n Classification Report:')
print (classification_report(y_test, y_predicted_DT))

**5-3) 분류기 생성III (SGD)**

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
clf_SGD = SGDClassifier(alpha=.0001).fit(X_train, y_train)
y_predicted_SGD = clf_SGD.predict(X_test)

In [None]:
print (' \n confusion_matrix (SGD) \n ')
cm = confusion_matrix(y_test, y_predicted_SGD)
print (cm)
print ('\n Classification Report:')
print (classification_report(y_test, y_predicted_SGD))

**5-4) 분류기 생성IV (SVM)**

In [None]:
from sklearn.svm import LinearSVC

In [None]:
clf_SVM = LinearSVC().fit(X_train, y_train)
y_predicted_SVM = clf_SVM.predict(X_test)

In [None]:
print (' \n confusion_matrix (SVM)\n ')
cm = confusion_matrix(y_test, y_predicted_SVM)
print (cm)
print ('\n Classification Report:')
print (classification_report(y_test, y_predicted_SVM))

**5-5) 분류기 생성V (RandomForest)**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf_RFA = RandomForestClassifier(n_estimators=10)
clf_RFA.fit(X_train, y_train)
y_predicted_RFA = clf_RFA.predict(X_test)

In [None]:
print (' \n confusion_matrix (RandomForest) \n ')
cm = confusion_matrix(y_test, y_predicted_RFA)
print (cm)
print ('\n Classification Report:')
print (classification_report(y_test, y_predicted_RFA))