# Wiki articles Recommendation System

## 1. Load Data

In [376]:
from nltk import sent_tokenize, word_tokenize,RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import math
from kmeans import Kmeans
import gensim 
from gensim.models import Word2Vec 
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
import re


In [377]:
with open("wiki.raw", "r") as file:
    data = file.readlines()
    file.close()
data = data[0:500]
data = ' '.join(data)
len(data)

119505

In [378]:
# split by wikis
WIKIS = data.split(" \n  =")
# data = '\n'.join(WIKIS)
len(WIKIS)

84

In [379]:
len(data)

119505

## 2. Preprocessing

In [388]:
# tokenization
words = word_tokenize(data)
words[0:40]

['=',
 'Valkyria',
 'Chronicles',
 'III',
 '=',
 'Senjō',
 'no',
 'Valkyria',
 '3',
 ':',
 'Unrecorded',
 'Chronicles',
 '(',
 'Japanese',
 ':',
 '戦場のヴァルキュリア3',
 ',',
 'lit',
 '.',
 'Valkyria',
 'of',
 'the',
 'Battlefield',
 '3',
 ')',
 ',',
 'commonly',
 'referred',
 'to',
 'as',
 'Valkyria',
 'Chronicles',
 'III',
 'outside',
 'Japan',
 ',',
 'is',
 'a',
 'tactical',
 'role']

In [389]:
#stop words removal and non english words removal
stop_words = set(stopwords.words("english"))
clean_words = RegexpTokenizer(r'\w+').tokenize(data)
words_without_stop_words = [word for word in clean_words if word not in stop_words]

In [390]:
# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words_without_stop_words]
stemmed_words[:40]

['valkyria',
 'chronicl',
 'iii',
 'senjō',
 'valkyria',
 '3',
 'unrecord',
 'chronicl',
 'japanes',
 '戦場のヴァルキュリア3',
 'lit',
 'valkyria',
 'battlefield',
 '3',
 'commonli',
 'refer',
 'valkyria',
 'chronicl',
 'iii',
 'outsid',
 'japan',
 'tactic',
 'role',
 'play',
 'video',
 'game',
 'develop',
 'sega',
 'media',
 'vision',
 'playstat',
 'portabl',
 'releas',
 'januari',
 '2011',
 'japan',
 'third',
 'game',
 'valkyria',
 'seri']

In [391]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemitized_words = [lemmatizer.lemmatize(word) for word in words_without_stop_words]
lemitized_words[:40]

['Valkyria',
 'Chronicles',
 'III',
 'Senjō',
 'Valkyria',
 '3',
 'Unrecorded',
 'Chronicles',
 'Japanese',
 '戦場のヴァルキュリア3',
 'lit',
 'Valkyria',
 'Battlefield',
 '3',
 'commonly',
 'referred',
 'Valkyria',
 'Chronicles',
 'III',
 'outside',
 'Japan',
 'tactical',
 'role',
 'playing',
 'video',
 'game',
 'developed',
 'Sega',
 'Media',
 'Vision',
 'PlayStation',
 'Portable',
 'Released',
 'January',
 '2011',
 'Japan',
 'third',
 'game',
 'Valkyria',
 'series']

In [392]:
n_grams = [' '.join(grams) for grams in ngrams(lemitized_words, 2)]
# n_grams
tokens = []
for n in set(n_grams):
    if n_grams.count(n) > 2:
        tokens.append(n)
features = tokens
features

['Arkansas Military',
 'Old Rhymes',
 'Pope Leo',
 'There Got',
 'win streak',
 'behind spiracle',
 'N 92',
 'woman national',
 'tower built',
 'written collaboration',
 'James Wisniewski',
 'The Gambia',
 'state highway',
 'Edmonton Oilers',
 'N 71',
 'Christian themed',
 'St Edmund',
 'The team',
 'Gallian Army',
 'day later',
 'Senjō Valkyria',
 'football association',
 'Chronicles II',
 '2 record',
 'Jackets season',
 'C S',
 'polychaete worm',
 'The game',
 'team The',
 'Phoenix Coyotes',
 'Christmas card',
 '15 inch',
 'Book Hymns',
 'Extra Edition',
 'State authority',
 '6 4',
 'four game',
 'Flower Fairy',
 '1 000',
 'military unit',
 'The Tower',
 'northern Australia',
 '7 5',
 'Blue Jackets',
 'development The',
 'Captain Totten',
 'Edmund Pitlake',
 '2011 12',
 'soldier would',
 'PlayStation Portable',
 'He Leadeth',
 'The first',
 'track listing',
 'WYO 151',
 'trading Nash',
 'work Barker',
 'The Arkansas',
 'game He',
 'national team',
 'franchise record',
 'previous game

## 3. Create Vector Spaces and Data Matrix

In [393]:
WIKIS[0:3]

[' = Valkyria Chronicles III = \n  \n  Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n  The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for s

In [386]:
len(set(lemitized_words)), len(WIKIS), len(set(stemmed_words))

(4303, 84, 3360)

In [387]:
features = list(set(lemitized_words))
for word in features:
    if len(word) < 3:
        features.remove(word)

len(features)

4148

In [394]:
bag_of_words = np.zeros((len(WIKIS), len(features)))
wiki_words = []
wiki_disticit_words = []
for wiki in WIKIS :
    wiki = RegexpTokenizer(r'\w+').tokenize(wiki)
    wiki = [word for word in wiki if word not in stop_words]
    wiki =  [lemmatizer.lemmatize(word) for word in wiki]
    wiki = ' '.join(wiki)
    words = [' '.join(grams) for grams in ngrams(wiki.split(), 2)]
    wiki_words.append(words)
    words = set(words)
    wiki_disticit_words.append( words)
#     print(wiki)
# wiki
# words
wiki_disticit_words
# type(wiki.split())

[{'2010 carrying',
  '2011 Japan',
  '2014 Media',
  '3 Unrecorded',
  '3 commonly',
  'A large',
  'After release',
  'Azure Revolution',
  'Battlefield 3',
  'Calamaty Raven',
  'Character designer',
  'Chronicles II',
  'Chronicles III',
  'Chronicles Japanese',
  'Due low',
  'Employing fusion',
  'Europan War',
  'Gallia Second',
  'Hitoshi Sakimoto',
  'Honjou composer',
  'II Valkyria',
  'II While',
  'II director',
  'III Senjō',
  'III localized',
  'III outside',
  'Imperial unit',
  'It also',
  'It met',
  'January 2011',
  'Japan praised',
  'Japan tactical',
  'Japan third',
  'Japanese western',
  'Japanese 戦場のヴァルキュリア3',
  'May n',
  'Media Vision',
  'Nameless penal',
  'November year',
  'Ozawa A',
  'PlayStation 4',
  'PlayStation Portable',
  'Portable Released',
  'Raita Honjou',
  'Raven The',
  'Released January',
  'Revolution PlayStation',
  'Sakimoto returned',
  'Second Europan',
  'Sega Media',
  'Senjō Valkyria',
  'Takeshi Ozawa',
  'The game',
  'Unrecord

In [395]:
WIKIS

[' = Valkyria Chronicles III = \n  \n  Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n  The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for s

In [396]:
for i in range(len(wiki_words)):
    for word in wiki_words[i]:
        if word in features:
            feature = features.index(word)
            bag_of_words[i, feature] = 1
# len(WIKIS)


In [397]:
pd.DataFrame(bag_of_words, columns = features).head()

Unnamed: 0,Arkansas Military,Old Rhymes,Pope Leo,There Got,win streak,behind spiracle,N 92,woman national,tower built,written collaboration,...,4 km,Douglas MacArthur,year It,opening theme,The road,Governor Rector,previous Valkyria,season Blue,federal troop,Raphael Tuck
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [398]:
tf_idf = np.zeros((len(WIKIS), len(features)))
features_count = np.sum(bag_of_words, axis=0)
for i in range(len(wiki_words)) :
    for word in wiki_words[i]:
        if word in features:
            feature = features.index(word)
            tf_idf[i, feature] = 1 # (wiki_words.count(word) / len(wiki_words)) * (math.log(len(WIKIS) / features_count[i]))
#             print(i, feature, (wiki_words.count(word) / len(wiki_words)) * (math.log(len(WIKIS) / features_count[i])))
#             else:
#                 tf_idf[i, feature] = 0
print(i)
   

83


In [370]:
tf_idf.shape
np.sum(tf_idf, axis=0)

array([ 3.,  3.,  3.,  3.,  4.,  3.,  3.,  3.,  1.,  2.,  3.,  3.,  2.,
        1.,  1.,  4.,  3.,  3.,  3.,  2.,  3.,  4.,  3.,  2.,  2.,  2.,
        2.,  3.,  3.,  4.,  3.,  3.,  1.,  3.,  3.,  2.,  3.,  6.,  2.,
        2.,  3.,  6.,  1.,  3.,  3.,  5.,  3.,  2.,  4.,  5.,  3.,  1.,
        3.,  7.,  2.,  3.,  2.,  3.,  4.,  2.,  2.,  6.,  1.,  3.,  3.,
        7.,  3.,  3.,  2.,  1.,  1.,  3.,  3.,  1.,  2.,  4.,  3.,  1.,
        3.,  3.,  5.,  3.,  3.,  1.,  3.,  3.,  2.,  3.,  2.,  3.,  2.,
        3.,  4.,  3.,  2.,  2.,  2.,  2.,  4.,  5.,  4.,  3.,  3.,  4.,
        2.,  2.,  3.,  4.,  5.,  4.,  3.,  4.,  2.,  5.,  1.,  3.,  4.,
        3.,  3.,  1.,  2.,  2.,  5.,  2.,  3.,  2.,  5.,  5.,  1.,  3.,
        7.,  3.,  9.,  3.,  3.,  3.,  2.,  2.,  3.,  3.,  2.,  3.,  3.,
        2.,  2.,  2.,  2.,  2.,  6.,  1.,  4.,  3.,  2.,  3.,  2.,  3.,
        2.,  2.,  4.,  3.,  6.,  4.,  3.,  4.,  2.,  4.,  1.,  2.,  2.,
        3.,  3.,  3.,  2.,  2.,  3.,  2.,  3.,  2.,  3.,  2.,  2

In [399]:
tfidf = TfidfVectorizer(ngram_range=(1,2))
values = tfidf.fit_transform(WIKIS)
values.toarray()

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.0106274, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [400]:
pd.DataFrame(tf_idf, columns = features).head()

Unnamed: 0,Arkansas Military,Old Rhymes,Pope Leo,There Got,win streak,behind spiracle,N 92,woman national,tower built,written collaboration,...,4 km,Douglas MacArthur,year It,opening theme,The road,Governor Rector,previous Valkyria,season Blue,federal troop,Raphael Tuck
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [401]:
wik = []
for w in WIKIS:
    wik.append(w.split())
word2vec = gensim.models.Word2Vec(wik, min_count = 1, size = 100, window = 5) 

In [402]:
word2vec.similarity("video","game")

  """Entry point for launching an IPython kernel.


0.99860096

In [403]:
# {0: 0, 1: 0, 2: 1, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 0, 9: 2, 10: 1, 11: 2}
title = []
for wiki in WIKIS:
    m = re.search(r'(\s*)(=*)(\s*)(=*)(\s*)(.*?) (=*)', wiki)
    title.append(m.group(6))
# WIKIS[9]
title

['Valkyria',
 'Gameplay',
 'Plot',
 'Development',
 'Music',
 'Release',
 'Reception',
 'Legacy',
 'Adaptations',
 'Tower',
 'Construction',
 'Civil',
 'Decommissioning',
 'Æsthetic',
 'Public',
 'Cicely',
 'Biography',
 'Early',
 'Art',
 'Flower',
 'The',
 'Middle',
 'Later',
 'Art',
 'Depictions',
 'Christian',
 'Works',
 'Cards',
 'Books',
 '=',
 'Book',
 'Religious',
 'Gambia',
 'The',
 'Background',
 'Plain',
 'Taxonomy',
 'Description',
 'Distribution',
 'Biology',
 'Human',
 '2011',
 'Off',
 'Regular',
 'October',
 'January',
 'March',
 'Post',
 'Standings',
 'Schedule',
 'Pre',
 'Regular',
 'Player',
 'Skaters',
 'Position',
 'Goals',
 'Points',
 'Goaltenders',
 'Games',
 'Wins',
 'Overtime',
 'Saves',
 'Milestones',
 'Transactions',
 'Gregorian',
 'Early',
 'Second',
 'Third',
 'Fourth',
 'Features',
 'There',
 'Background',
 'Composition',
 'Critical',
 'Music',
 'Track',
 'Charts',
 'Nebraska',
 'Route',
 'History',
 'Major',
 'USS',
 'Description',
 'As']

In [404]:
recommender = Kmeans(k=3)
recommender.fit(list(bag_of_words))
# recommender.predict(pd.DataFrame(data=word2vec))

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0

In [405]:
articles = enumerate(title)
articles = list(articles)
articles

[(0, 'Valkyria'),
 (1, 'Gameplay'),
 (2, 'Plot'),
 (3, 'Development'),
 (4, 'Music'),
 (5, 'Release'),
 (6, 'Reception'),
 (7, 'Legacy'),
 (8, 'Adaptations'),
 (9, 'Tower'),
 (10, 'Construction'),
 (11, 'Civil'),
 (12, 'Decommissioning'),
 (13, 'Æsthetic'),
 (14, 'Public'),
 (15, 'Cicely'),
 (16, 'Biography'),
 (17, 'Early'),
 (18, 'Art'),
 (19, 'Flower'),
 (20, 'The'),
 (21, 'Middle'),
 (22, 'Later'),
 (23, 'Art'),
 (24, 'Depictions'),
 (25, 'Christian'),
 (26, 'Works'),
 (27, 'Cards'),
 (28, 'Books'),
 (29, '='),
 (30, 'Book'),
 (31, 'Religious'),
 (32, 'Gambia'),
 (33, 'The'),
 (34, 'Background'),
 (35, 'Plain'),
 (36, 'Taxonomy'),
 (37, 'Description'),
 (38, 'Distribution'),
 (39, 'Biology'),
 (40, 'Human'),
 (41, '2011'),
 (42, 'Off'),
 (43, 'Regular'),
 (44, 'October'),
 (45, 'January'),
 (46, 'March'),
 (47, 'Post'),
 (48, 'Standings'),
 (49, 'Schedule'),
 (50, 'Pre'),
 (51, 'Regular'),
 (52, 'Player'),
 (53, 'Skaters'),
 (54, 'Position'),
 (55, 'Goals'),
 (56, 'Points'),
 (57, 

In [410]:
def prediction(article_name):
    pass

In [409]:
cluster = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 0, 27: 0, 28: 0, 29: 0, 30: 0, 31: 0, 32: 0, 33: 0, 34: 0, 35: 0, 36: 0, 37: 0, 38: 0, 39: 0, 40: 0, 41: 0, 42: 0, 43: 0, 44: 0, 45: 0, 46: 0, 47: 0, 48: 0, 49: 0, 50: 0, 51: 0, 52: 0, 53: 0, 54: 0, 55: 0, 56: 0, 57: 0, 58: 0, 59: 0, 60: 0, 61: 0, 62: 0, 63: 0, 64: 0, 65: 0, 66: 0, 67: 0, 68: 0, 69: 0, 70: 0, 71: 0, 72: 0, 73: 0, 74: 0, 75: 0, 76: 0, 77: 0, 78: 0, 79: 0, 80: 0, 81: 0, 82: 0, 83: 0}new_cluster = [] #[[], [], []]
for k,v in clusters.items():
    new_cluster.append((v,articles[k][1] ))
    
new_cluster.sort()
new_cluster
# for i in range(3):
#     if 

[(0, 'Adaptations'),
 (0, 'Development'),
 (0, 'Gameplay'),
 (0, 'Music'),
 (0, 'Reception'),
 (0, 'Release'),
 (0, 'Valkyria'),
 (1, 'Construction'),
 (1, 'Legacy'),
 (1, 'Plot'),
 (2, 'Civil'),
 (2, 'Tower')]

In [145]:
bag_of_words[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [211]:
recommender.predict(pd.DataFrame(data=bag_of_words[0:5]))

{0: 0, 1: 0, 2: 1, 3: 0, 4: 0}

In [206]:
def bigram_prob(expected, given):
    
    if given == "*":
        given_prob = len(WIKIS)
        expected_prob = features_count[features.index(expected)] 
        return expected_prob / given_prob
    else:
        given_index = features.index(given)
        given_prob = features_count[given_index]
        expected_index = features.index(expected)
        expected_prob = 0
        
        for wiki_arr in bag_of_words:
            if wiki_arr[given_index] == 1 and wiki_arr[expected_index] == 1 :
                expected_prob += 1
                

In [208]:
bigram_prob("PlayStation Portable", "*")

0.25

# Arabic

In [212]:
 set(stopwords.words("arabic"))

{'آه',
 'آها',
 'آي',
 'أف',
 'أقل',
 'أكثر',
 'ألا',
 'أم',
 'أما',
 'أن',
 'أنا',
 'أنت',
 'أنتم',
 'أنتما',
 'أنتن',
 'أنى',
 'أو',
 'أولئك',
 'أولاء',
 'أوه',
 'أي',
 'أين',
 'أينما',
 'أيها',
 'إذ',
 'إذا',
 'إذما',
 'إذن',
 'إلا',
 'إلى',
 'إليك',
 'إليكم',
 'إليكما',
 'إليكن',
 'إما',
 'إن',
 'إنا',
 'إنما',
 'إنه',
 'إي',
 'إيه',
 'التي',
 'الذي',
 'الذين',
 'اللائي',
 'اللاتي',
 'اللتان',
 'اللتيا',
 'اللتين',
 'اللذان',
 'اللذين',
 'اللواتي',
 'بخ',
 'بس',
 'بعد',
 'بعض',
 'بك',
 'بكم',
 'بكما',
 'بكن',
 'بل',
 'بلى',
 'بما',
 'بماذا',
 'بمن',
 'بنا',
 'به',
 'بها',
 'بهم',
 'بهما',
 'بهن',
 'بي',
 'بيد',
 'بين',
 'تلك',
 'تلكم',
 'تلكما',
 'ته',
 'تي',
 'تين',
 'تينك',
 'ثم',
 'ثمة',
 'حاشا',
 'حبذا',
 'حتى',
 'حيث',
 'حيثما',
 'حين',
 'خلا',
 'دون',
 'ذا',
 'ذات',
 'ذاك',
 'ذان',
 'ذانك',
 'ذلك',
 'ذلكم',
 'ذلكما',
 'ذلكن',
 'ذه',
 'ذو',
 'ذوا',
 'ذواتا',
 'ذواتي',
 'ذي',
 'ذين',
 'ذينك',
 'ريث',
 'سوف',
 'سوى',
 'شتان',
 'عدا',
 'عسى',
 'عل',
 'على',
 'عليك',
 'عليه',
 'عم

In [411]:
ar_data = open("ar_wiki", "r").read()

In [412]:
words = ar_data.split()

In [413]:
ar_stop_words = set(stopwords.words("arabic"))

effective_word = [word for word in words if word not in ar_stop_words]

In [414]:
effective_word

['الرجل', 'اين', 'الشجرة', 'مقالة', 'جميلة']

In [415]:
ar_features = list(set(effective_word))
lines = ar_data.split("\n")
data_matrix = np.zeros((len(lines), len(ar_features)))

for i in range(len(lines)-1) :
        for word in lines[i].split():
            if word in ar_features:
                index = ar_features.index(word)
#                 print(i,index)
                data_matrix[i, index] = 1

In [416]:
pd.DataFrame(data_matrix, columns = ar_features).head()

Unnamed: 0,اين,جميلة,مقالة,الرجل,الشجرة
0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0


In [417]:
ar_recomend = Kmeans(k=2)
ar_recomend.fit(list(data_matrix))

[array([0., 0., 0., 0., 0.]), array([0., 0., 0., 1., 0.])]
[[0.25, 0.25, 0.25, 0.25, 0.25], [nan, nan, nan, nan, nan]]
{0: 0, 1: 0, 2: 0, 3: 0}
