In [3]:
# read the data into a pandas dataframe
import pandas as pd
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfneg = data2df('MoviePosNeg/neg/', 0) # NEG
dfpos = data2df('MoviePosNeg/pos/', 1) # POS

df = pd.concat([dfpos, dfneg], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
437,cv437_24070.txt,* * * warning - the following review contains...,0
210,cv210_9312.txt,i must say from the outset that i have never b...,1
809,cv809_5009.txt,driving miss daisy takes its sweet time to tel...,1
302,cv302_26481.txt,"the tagline for this film is : "" some houses a...",0
36,cv036_16831.txt,dora ( fernanda montenegro ) sits behind a mak...,1
8,cv008_29435.txt,"after bloody clashes and independence won , lu...",1
542,cv542_18980.txt,don't let the following quirks of this review ...,1
427,cv427_11693.txt,""" mandingo "" has traditionally been seen as o...",0
448,cv448_14695.txt,a wonderful little movie that is really intere...,1
132,cv132_5618.txt,quaid stars as a man who has taken up the prof...,1


In [5]:
# setup the data
X, y = df['text'], df['class']
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [15]:
clf = Pipeline(steps=[('pp',TfidfVectorizer()),('mdl',MultinomialNB())])

In [16]:
from sklearn.model_selection import GridSearchCV

In [27]:
param_grid = {
    'mdl__alpha': [0.01,0.1,0.2,0.5,1]
    }
gscv = GridSearchCV(clf, param_grid, cv=5, return_train_score=False) #idd=False default

In [28]:
gscv.fit(Xtrain, ytrain)


0.8225
[[171  34]
 [ 37 158]]
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       205
           1       0.82      0.81      0.82       195

   micro avg       0.82      0.82      0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [29]:
ypred = gscv.best_estimator_.predict(Xtest)
from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.8225
[[171  34]
 [ 37 158]]
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       205
           1       0.82      0.81      0.82       195

   micro avg       0.82      0.82      0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [30]:
print(gscv.cv_results_, "\n")

{'mean_fit_time': array([0.54137511, 0.54076142, 0.54476461, 0.54854522, 0.53737469]), 'std_fit_time': array([0.01016184, 0.0026377 , 0.0022085 , 0.0071172 , 0.00843817]), 'mean_score_time': array([0.12326274, 0.12665534, 0.12584329, 0.13123779, 0.13083925]), 'std_score_time': array([0.00375717, 0.00167925, 0.0032919 , 0.00420161, 0.00802738]), 'param_mdl__alpha': masked_array(data=[0.01, 0.1, 0.2, 0.5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'mdl__alpha': 0.01}, {'mdl__alpha': 0.1}, {'mdl__alpha': 0.2}, {'mdl__alpha': 0.5}, {'mdl__alpha': 1}], 'split0_test_score': array([0.78125 , 0.8     , 0.815625, 0.828125, 0.821875]), 'split1_test_score': array([0.78125 , 0.8     , 0.809375, 0.821875, 0.821875]), 'split2_test_score': array([0.78125 , 0.7875  , 0.790625, 0.784375, 0.79375 ]), 'split3_test_score': array([0.775   , 0.8125  , 0.815625, 0.815625, 0.8125  ]), 'split4_test_score': array([0.7625  , 0.784375, 

In [1]:
import pandas as pd

In [3]:
df=pd.DataFrame([['Angie Fiona Forest',40],['Abe Steven Johnson',60],['Mary Janice Carson',60],['May Stefan Clemson',60],['Charline Jobs James',80]], columns=['Name','Age'])

In [4]:
df.head()

Unnamed: 0,Name,Age
0,Angie Fiona Forest,40
1,Abe Steven Johnson,60
2,Mary Janice Carson,60
3,May Stefan Clemson,60
4,Charline Jobs James,80


In [14]:
temp= [i.split()[0] for i in df['Name']]
temp


['Angie', 'Abe', 'Mary', 'May', 'Charline']

In [16]:
df['FNFL']=[i[0] for i in temp]
df

Unnamed: 0,Name,Age,FNFL
0,Angie Fiona Forest,40,A
1,Abe Steven Johnson,60,A
2,Mary Janice Carson,60,M
3,May Stefan Clemson,60,M
4,Charline Jobs James,80,C


In [17]:
df.groupby('FNFL').sum()['Age']

FNFL
A    100
C     80
M    120
Name: Age, dtype: int64