In [None]:
import pandas as pd
# Display configuration: never truncate cell contents or hide columns,
# widen the text output to 200 characters, and render floats with two decimals.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.width', 200),
    ('display.float_format', lambda x: '%.2f' % x),
):
    pd.set_option(option_name, option_value)

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
import nltk

# Download the NLTK resources, one nltk.download() call per resource name.
for nltk_resource in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

from textblob import Word, TextBlob
from wordcloud import WordCloud

from warnings import filterwarnings
# Suppress every Python warning from here on: only the 'ignore' action is
# given, with no category or module filter, so no warning is exempt.
# NOTE(review): this also hides deprecation warnings raised by this notebook's
# own code; consider narrowing the filter.
filterwarnings('ignore')

In [None]:
# Sample English paragraph used as the notebook's input document. It is wrapped
# in a DataFrame for the cleaning steps and passed as-is to spaCy for NER.
text = """On January 3rd, 2023, Dr. Emily Watson, a senior data scientist at GreenAI Inc., gave a keynote speech at the International Conference on Artificial Intelligence in Paris, France. During her talk, she emphasized the importance of ethical AI and data privacy, citing recent cases of misuse in various industries.

She mentioned that over 3.2 million users were affected by a data breach last year, resulting in damages estimated at $12.5 million. Furthermore, she highlighted the role of open-source libraries, such as spaCy and NLTK, in democratizing access to natural language processing tools. According to her, students and researchers can now build high-quality NLP models without needing large financial resources.

"AI is not just about machines," she said, "it’s about how we interact with technology in a human-centered way." After the session, attendees from universities like Stanford, MIT, and Oxford approached her to discuss future collaboration opportunities.

At 5:45 PM, she posted a summary of her speech on Twitter, receiving over 8,000 likes and 1,200 retweets within a few hours. Her tweet included hashtags like #AIethics, #DataPrivacy, and #NLPtools.

The event concluded with a panel discussion moderated by Mr. John Lee, a journalist from TechWorld Weekly, who asked, “How can governments regulate AI without stifling innovation?”

"""

In [None]:
# Print the number of characters in the sample text.
print(len(text))

In [None]:
# Wrap the sample text in a one-row DataFrame with a single "raw_text" column.
df = pd.DataFrame({"raw_text": [text]})

# Echo the stored text under a heading, then display the frame itself.
print("🔹 Orijinal Metin:", df.loc[0, "raw_text"], sep="\n")
df

In [None]:
#TOKENIZATION
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Fetch the 'punkt_tab' resource before tokenizing.
nltk.download("punkt_tab")

# Text of the first row of the raw_text column; the word-tokenization cell
# reads this variable as well.
raw_text_column = df["raw_text"]
tokenize = raw_text_column.iloc[0]

# Split the text into sentences and display the resulting list.
sent_tokenize(tokenize)

In [None]:
# Split the text held in `tokenize` into word-level tokens and display them.
w_tokenize = word_tokenize(tokenize)
w_tokenize

In [None]:
# Number of word tokens.
len(w_tokenize)

In [None]:
# Display the DataFrame as it stands before the cleaning steps.
df

In [None]:
#LOWERCASING
# Overwrite the raw_text column with its lower-cased version. The token list
# built earlier (w_tokenize) is a separate object and keeps its original casing.
df["raw_text"] = df["raw_text"].str.lower()

df

In [None]:
#REMOVING PUNCTUATION
from string import punctuation

# Display the punctuation characters for reference only; the removal step
# itself uses a regular expression rather than this constant.
punctuation

In [None]:
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s).
# The pattern is a raw string so the backslashes reach the regex engine
# verbatim; in a normal string literal "\w" and "\s" are invalid escape
# sequences, which Python already warns about.
df["raw_text"] = df["raw_text"].str.replace(r'[^\w\s]', '', regex=True)

df

In [None]:
#REMOVING STOPWORDS
# Remove stop words (a, an, and, as, at, but, by, for, if, is, it, on, of, or, s, that, their, the, then, these ...)
from nltk.corpus import stopwords

# Download the 'stopwords' dataset
nltk.download('stopwords')

# English stop-word list; displayed here for inspection and used by the
# filtering cell that follows.
sw = stopwords.words('english')
sw

In [None]:
# Drop stop words from the text column: split each entry on whitespace, keep
# the words that are not in sw, and re-join them with single spaces.
stopword_lookup = set(sw)


def _drop_stopwords(entry):
    """Return entry (cast to str) without the words that appear in sw."""
    kept_words = [piece for piece in str(entry).split() if piece not in stopword_lookup]
    return " ".join(kept_words)


df["raw_text"] = df["raw_text"].apply(_drop_stopwords)

In [None]:
# Display the DataFrame after stop-word removal.
df

In [None]:
# Filter the stop words out of the word tokens.
#
# The stop-word list is loaded once into a set instead of being re-evaluated
# for every token. Tokens are compared in lower case because w_tokenize was
# built from the original-case text, while the entries of NLTK's English list
# are lower case; a case-sensitive test would let "On", "She", "The", ...
# through. Tokens that are kept retain their original casing.
english_stopwords = set(stopwords.words("english"))
without_stopwords = [
    token for token in w_tokenize if token.lower() not in english_stopwords
]

without_stopwords

In [None]:
# Token counts before and after stop-word removal, shown side by side so the
# effect of the filtering is visible (the original count alone was already
# displayed right after tokenization).
len(w_tokenize), len(without_stopwords)

In [None]:
#REMOVING NUMBERS
# Delete every digit character from the text column.
# The pattern is a raw string so the backslash reaches the regex engine
# verbatim; in a normal string literal "\d" is an invalid escape sequence,
# which Python already warns about.
df["raw_text"] = df["raw_text"].str.replace(r'\d', '', regex=True)

df


In [None]:
#STEMMING OR LEMMATIZATION
#stemming
from nltk.stem import PorterStemmer

# Porter stemmer instance used by the stemming cell that follows.
stm = PorterStemmer()

In [None]:
# Reduce every token of the stop-word-filtered list to its stem.
with_stem = list(map(stm.stem, without_stopwords))

# Show the tokens after stemming:
print("Stemmed Words:", with_stem)

In [None]:
# Number of stemmed tokens.
len(with_stem)

In [None]:
from nltk.stem import WordNetLemmatizer

# Lemmatize each stop-word-filtered token. lemmatize() is called with the
# token only, so the lemmatizer's default part of speech applies.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, without_stopwords))

In [None]:
#NAMED ENTITY RECOGNITION (NER)
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Analyze the original text (not the cleaned DataFrame column)
doc = nlp(text)

# Print each detected entity with its label and spaCy's explanation of that
# label, as left-aligned columns of width 30 and 15
print(f"{'Entity':<30} {'Label':<15} {'Explanation'}")
print("-" * 60)
for ent in doc.ents:
    print(f"{ent.text:<30} {ent.label_:<15} {spacy.explain(ent.label_)}")

In [None]:
#PART-OF-SPEECH (POS) TAGGING
#   parts of a sentence
# * CC conjunction
# * JJ adjective
# * NN noun
# * RB adverb
# * VB verb

nltk.download('averaged_perceptron_tagger_eng')

# Tag the stop-word-filtered token list and display the (token, tag) pairs.
# NOTE(review): the tagger receives the tokens with stop words already
# removed; tagging the full token sequence may give better tags - confirm.
post = nltk.pos_tag(without_stopwords)
post

In [None]:
# Visualize the distribution of the POS tags.
pos_tags = [tag for word, tag in post]
pos_counts = pd.Series(pos_tags).value_counts()

# Label the chart so it can be read on its own, and finish with plt.show()
# (as the other plotting cells do) so the Axes repr is not echoed as output.
ax = pos_counts.plot(kind='bar')
ax.set_title('POS tag distribution')
ax.set_xlabel('POS tag')
ax.set_ylabel('Count')
plt.show()

In [None]:
#WORD FREQUENCY COUNT
#TEXT VISUALIZATION
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vectorizer on the cleaned text column.
count_v = CountVectorizer()
X = count_v.fit_transform(df["raw_text"])

# Convert the result to a DataFrame with one column per vocabulary word
word_counts = pd.DataFrame(X.toarray(), columns=count_v.get_feature_names_out())

# Print the total count of each word, most frequent first.
print(word_counts.sum().sort_values(ascending=False))

In [None]:
# Total frequency of each word in 'word_counts', most frequent first.
word_totals = word_counts.sum().sort_values(ascending=False)

# Move the words out of the index into a regular column, then give both
# columns meaningful names.
word_freq = word_totals.reset_index()
word_freq.columns = ['words', 'tf']
word_freq

In [None]:
# Bar chart of the words whose frequency is greater than one.

repeated_words = word_freq.loc[word_freq["tf"] > 1]
repeated_words.plot(kind="bar", x="words", y="tf")
plt.show()

In [None]:
# Word cloud of the cleaned text

# Concatenate every row of the text column into one space-separated string.
text_Context = " ".join(df["raw_text"])

# Build the cloud from that string and render it as an image without axes.
wordcloud = WordCloud().generate(text_Context)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()