In [1]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import re

In [3]:
text = """
    University of Ilorin (UNILORIN) is a federal government-owned public research university in Ilorin, Kwara State, Nigeria.[2][3] The university's main campus sits on an expansive area of land, about 5,000 hectares in the ancient city of Ilorin; making it the largest university in Nigeria and one of the largest in Africa by landmass. The university comprises 16 faculties and over 100 academic departments offering 103 programmes. It was established by a decree of the Federal Military Government of Nigeria in August,1975. The University of Ilorin has the highest enrollment of foreign students in Nigeria.[4][5] The establishment aimed to implement one of the educational directives of the Third National Development Plan, which was aimed at providing more opportunities for Nigerians aspiring to acquire university education and to generate high-level manpower, which is vital for the rapidly expanding economy. Compared[6] to other higher institutions of learning in the country, the institution has one of the largest land areas, covering approximately 15,000 hectares of land.[7][8] It is reported by Joint Admission Matriculation Board (JAMB) to be the most sought-after Nigerian university in 2021.[9] And again in 2023, and also in 2024, it was announced by the JAMB[10][11] Head, Professor Ishaq Oloyede to be the sought-after University, for the 2023 Unified Tertiary Matriculation Examinations (UTME), making it for the 10th consecutive year.
"""

In [13]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt_tab: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [5]:
sentences = nltk.sent_tokenize(text)

print(sentences)

['\n    University of Ilorin (UNILORIN) is a federal government-owned public research university in Ilorin, Kwara State, Nigeria.', "[2][3] The university's main campus sits on an expansive area of land, about 5,000 hectares in the ancient city of Ilorin; making it the largest university in Nigeria and one of the largest in Africa by landmass.", 'The university comprises 16 faculties and over 100 academic departments offering 103 programmes.', 'It was established by a decree of the Federal Military Government of Nigeria in August,1975.', 'The University of Ilorin has the highest enrollment of foreign students in Nigeria.', '[4][5] The establishment aimed to implement one of the educational directives of the Third National Development Plan, which was aimed at providing more opportunities for Nigerians aspiring to acquire university education and to generate high-level manpower, which is vital for the rapidly expanding economy.', 'Compared[6] to other higher institutions of learning in t

In [6]:
corpus = []

for sentence in sentences:
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    review = review.lower()
    corpus.append(review)

In [7]:
corpus

['     university of ilorin  unilorin  is a federal government owned public research university in ilorin  kwara state  nigeria ',
 '       the university s main campus sits on an expansive area of land  about       hectares in the ancient city of ilorin  making it the largest university in nigeria and one of the largest in africa by landmass ',
 'the university comprises    faculties and over     academic departments offering     programmes ',
 'it was established by a decree of the federal military government of nigeria in august      ',
 'the university of ilorin has the highest enrollment of foreign students in nigeria ',
 '       the establishment aimed to implement one of the educational directives of the third national development plan  which was aimed at providing more opportunities for nigerians aspiring to acquire university education and to generate high level manpower  which is vital for the rapidly expanding economy ',
 'compared    to other higher institutions of learning

In [14]:
#stopwords and stemming

lemmatize = WordNetLemmatizer()

processed_corpus = []

for sentence in corpus:
    words = nltk.word_tokenize(sentence)
    cleaned_words = [ 
        lemmatize.lemmatize(word) 
        for word in words 
        if word not in stopwords.words('english')
    ]
    processed_corpus.append(" ".join(cleaned_words))

In [15]:
processed_corpus

['university ilorin unilorin federal government owned public research university ilorin kwara state nigeria',
 'university main campus sits expansive area land hectare ancient city ilorin making largest university nigeria one largest africa landmass',
 'university comprises faculty academic department offering programme',
 'established decree federal military government nigeria august',
 'university ilorin highest enrollment foreign student nigeria',
 'establishment aimed implement one educational directive third national development plan aimed providing opportunity nigerian aspiring acquire university education generate high level manpower vital rapidly expanding economy',
 'compared higher institution learning country institution one largest land area covering approximately hectare land',
 'reported joint admission matriculation board jamb sought nigerian university',
 'also announced jamb head professor ishaq oloyede sought university unified tertiary matriculation examination utme 

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True, ngram_range=(3, 3))

In [22]:
X = vectorizer.fit_transform(processed_corpus)

In [23]:
vectorizer.vocabulary_

{'university ilorin unilorin': 96,
 'ilorin unilorin federal': 45,
 'unilorin federal government': 91,
 'federal government owned': 31,
 'government owned public': 36,
 'owned public research': 76,
 'public research university': 80,
 'research university ilorin': 83,
 'university ilorin kwara': 95,
 'ilorin kwara state': 43,
 'kwara state nigeria': 53,
 'university main campus': 97,
 'main campus sits': 61,
 'campus sits expansive': 13,
 'sits expansive area': 84,
 'expansive area land': 29,
 'area land hectare': 10,
 'land hectare ancient': 55,
 'hectare ancient city': 38,
 'ancient city ilorin': 6,
 'city ilorin making': 14,
 'ilorin making largest': 44,
 'making largest university': 62,
 'largest university nigeria': 58,
 'university nigeria one': 98,
 'nigeria one largest': 69,
 'one largest africa': 73,
 'largest africa landmass': 56,
 'university comprises faculty': 92,
 'comprises faculty academic': 16,
 'faculty academic department': 30,
 'academic department offering': 0,
 'de

In [25]:
processed_corpus[0]

'university ilorin unilorin federal government owned public research university ilorin kwara state nigeria'

In [26]:
print(X.toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0
  0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 

In [28]:
##TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
cv = TfidfVectorizer()
X2 = cv.fit_transform(processed_corpus)

In [32]:
processed_corpus[0]

'university ilorin unilorin federal government owned public research university ilorin kwara state nigeria'

In [34]:
print(X2.toarray())

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.25601687 0.
  0.         0.25601687 0.         0.         0.         0.
  0.         0.44519856 0.         0.         0.         0.
  0.         0.30311633 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.19667858 0.         0.         0.         0.
  0.         0.30311633 0.         0.         0.         0.
  0.30311633 0.         0.         0.30311633 0.         0.
  0.30311633 0.         0.         0.         0.         0.
  0.30311633 0.28416448 0.         0.         0.        ]
 [0.         0.         0.         0.24933485 0.         0.
  0.24933485 0.         0.         0.21059