# 2023-03-06

# 자연어데이터의 표현

## One-Hot Vector

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
data = ['Kim', 'Lee', 'Choi', 'Park', 'Nam']

In [4]:
label_encoder = LabelEncoder()
integers = label_encoder.fit_transform(data)
integers

array([1, 2, 0, 4, 3], dtype=int64)

In [5]:
# Build a one-hot encoder that returns a dense ndarray rather than a sparse matrix.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` (the old name
# was removed in 1.4), so try the new keyword first and fall back for older installs.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Turn the 1-D label array into a single-column 2-D array (n_samples, 1).
integers_array = integers.reshape(-1, 1)
integers_array.shape

(5, 1)

In [6]:
onehot_data = onehot_encoder.fit_transform(integers_array)
onehot_data



array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.]])

In [10]:
onehot_data.argmax(axis=1)

array([1, 2, 0, 4, 3], dtype=int64)

In [11]:
sample_data = onehot_data[0]

In [13]:
# inverse

import numpy as np

label_encoder.inverse_transform([np.argmax(sample_data)])

array(['Kim'], dtype='<U4')

## Count기반

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['''이봐 사람이 언제 죽는다고 생각하나?
심장이 총알에 뚫렸을 때? 아니야
불치의 병에 걸렸을 때? 아니야
맹독 버섯 스프를 마셨을 때? 아니야
사람들에게서 잊혀질 때다!''']
vector = CountVectorizer()

vector.fit_transform(corpus).toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1]],
      dtype=int64)

In [18]:
vector.vocabulary_

{'이봐': 15,
 '사람이': 9,
 '언제': 14,
 '죽는다고': 17,
 '생각하나': 10,
 '심장이': 12,
 '총알에': 18,
 '뚫렸을': 2,
 '아니야': 13,
 '불치의': 7,
 '병에': 6,
 '걸렸을': 0,
 '맹독': 4,
 '버섯': 5,
 '스프를': 11,
 '마셨을': 3,
 '사람들에게서': 8,
 '잊혀질': 16,
 '때다': 1}

In [19]:
# 가중치를 가지는 벡터 형태

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer().fit(corpus)
tfidf_vector.transform(corpus).toarray()

array([[0.19245009, 0.19245009, 0.19245009, 0.19245009, 0.19245009,
        0.19245009, 0.19245009, 0.19245009, 0.19245009, 0.19245009,
        0.19245009, 0.19245009, 0.19245009, 0.57735027, 0.19245009,
        0.19245009, 0.19245009, 0.19245009, 0.19245009]])

In [20]:
tfidf_vector.vocabulary_

{'이봐': 15,
 '사람이': 9,
 '언제': 14,
 '죽는다고': 17,
 '생각하나': 10,
 '심장이': 12,
 '총알에': 18,
 '뚫렸을': 2,
 '아니야': 13,
 '불치의': 7,
 '병에': 6,
 '걸렸을': 0,
 '맹독': 4,
 '버섯': 5,
 '스프를': 11,
 '마셨을': 3,
 '사람들에게서': 8,
 '잊혀질': 16,
 '때다': 1}

## Similarity(유사도)

### Cosine Similarity

In [21]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Three 2-D row vectors of shape (1, 3), written directly as nested lists.
x = np.array([[1, 2, 3]])
y = np.array([[4, 5, 6]])
z = np.array([[-10, -20, -30]])   # x scaled by -10, i.e. pointing the opposite way

In [29]:
cosine_similarity(x, y)    # the smaller the angle between the two vectors, the more similar they are judged to be

array([[0.97463185]])

In [30]:
cosine_similarity(y, z)

array([[-0.97463185]])

In [31]:
cosine_similarity(x, z)

array([[-1.]])

### Jaccard Similarity

In [32]:
# Set data type

set1 = {'choi', 'jin', 'yeong'}
set2 = {'choi', 'jin', 'woong'}

# Elements present in both sets.
intersection = set1 & set2      # same as set1.intersection(set2)
print(intersection)

# Elements present in either set.
union = set1 | set2             # same as set1.union(set2)
print(union)

{'jin', 'choi'}
{'jin', 'woong', 'yeong', 'choi'}


In [33]:
jaccard_sim = len(intersection)/len(union)
print(jaccard_sim)

0.5


In [34]:
# 사이킷런 사용

from sklearn.metrics import jaccard_score

vec1 = [1, 1, 1, 0, 0]
vec2 = [1, 0, 1, 0, 0]
jaccard_score(vec1, vec2)

0.6666666666666666

## 줄거리 활용 추천시스템

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the movie metadata table.
# The recorded output echoes this statement the way pandas does when it emits a
# warning for it (most likely a mixed-dtype DtypeWarning). low_memory=False makes
# pandas infer each column's dtype from the whole file in one pass instead of
# chunk by chunk.
movies = pd.read_csv('movies_metadata.csv', low_memory=False)
movies.head()

  movies = pd.read_csv('movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [40]:
movies.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [41]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [43]:
movies.shape

(45466, 24)

In [44]:
movies = movies.head(20000)                    # 메모리 에러가 나는 경우

In [46]:
movies['overview'].isnull().sum()             # 결측치 확인

135

In [47]:
movies = movies.dropna(subset = ['overview'])   # NaN값 제거

In [48]:
movies['overview'].isnull().sum() 

0

In [49]:
# Vectorize the overview text with TF-IDF

tfidf_vec = TfidfVectorizer(stop_words = 'english')          # stop_words='english' removes common English stop words such as 'the'
tfidf_matrix = tfidf_vec.fit_transform(movies['overview'])

In [50]:
tfidf_matrix.shape

(19865, 47487)

In [53]:
# Before vectorization: raw overview text of the first remaining movie.
# Use positional .iloc because dropna() keeps the original index labels, so the
# label 0 only exists if the very first row happened to have an overview.

movies['overview'].iloc[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [52]:
# After vectorization: TF-IDF weights of the first overview.
# Densify only row 0; calling .toarray() on the whole matrix first would
# allocate the full dense (documents x vocabulary) array just to read one row.

tfidf_matrix[0].toarray().ravel()

array([0., 0., 0., ..., 0., 0., 0.])

In [54]:
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [55]:
cosine_sim_matrix.shape

(19865, 19865)

In [57]:
# 영화 제목명과 위 cosine_sim_matrix를 데이터프레임으로 합치기 위해

movies_titles = movies['original_title'].values
print(len(movies_titles))
movies_titles

19865


array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Two in the Wave',
       'Lotte Reiniger: Homage to the Inventor of the Silhouette Film',
       "RKO Production 601: The Making of 'Kong, the Eighth Wonder of the World'"],
      dtype=object)

In [58]:
cos_sim_df = pd.DataFrame(cosine_sim_matrix, columns = movies_titles, index = movies_titles)
cos_sim_df.head()

Unnamed: 0,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Il ne faut jurer... de rien !,Calmos,How to Make Love to a Woman,"After Fall, Winter",Violeta se fue a los cielos,L'Ordre et la Morale,Versailles,Two in the Wave,Lotte Reiniger: Homage to the Inventor of the Silhouette Film,"RKO Production 601: The Making of 'Kong, the Eighth Wonder of the World'"
Toy Story,1.0,0.015752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.009968,0.060988,0.0,0.0,0.0,0.024569,0.0,0.0,0.0
Jumanji,0.015752,1.0,0.049069,0.0,0.0,0.051801,0.0,0.0,0.106296,0.0,...,0.029225,0.0,0.0,0.004263,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.049069,1.0,0.0,0.024997,0.0,0.0,0.006487,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,1.0,0.0,0.007122,0.0,0.009382,0.0,0.0,...,0.008295,0.0,0.008937,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.024997,0.0,1.0,0.0,0.032964,0.0,0.03272,0.0,...,0.009268,0.0,0.0,0.025405,0.0,0.0,0.078383,0.0,0.0,0.0


In [68]:
cos_sim_df['최종병기 활'].sort_values(ascending=False)

최종병기 활               1.000000
National Velvet      0.161298
파란대문                 0.148504
Symetria             0.134635
Delta                0.130526
                       ...   
A Christmas Carol    0.000000
Electric Dreams      0.000000
The Hit              0.000000
The Ice Pirates      0.000000
Trespassing          0.000000
Name: 최종병기 활, Length: 19865, dtype: float64

In [None]:
# 김종현 바보 멍청이 똥개 해삼 말미잘~~~~~~~~

## 자연어 데이터의 전처리 [Code]

### tensorflow (정수인코딩 ,패딩)

In [69]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [70]:
preprocessed_sentences = [['pop', 'jazz'],
                          ['pop', 'hip hop', 'jazz'],
                          ['pop', 'ratin', 'jazz'],
                          ['knew', 'R&B'],
                          ['R&B', 'classic', 'ratin', 'R&B'],
                          ['ratin', 'R&B'],
                          ['pop', 'classic', 'blues'],
                          ['pop', 'classic', 'blues'],
                          ['pop', 'classic', 'R&B'],
                          ['rock', 'rock', 'ratin', 'R&B',
                           'k-pop', 'pop', 'crazy'],
                          ['pop', 'j-pop', 'ratin', 'rap']]

In [73]:
# tokenizing

tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
encoded_text = tokenizer.texts_to_sequences(preprocessed_sentences)
encoded_text

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [74]:
# padding -> 가장 긴 길이에 맞춰 0을 채워줌 (default->앞쪽으로)

encoded_text_pad = pad_sequences(encoded_text)
encoded_text_pad

array([[ 0,  0,  0,  0,  0,  1,  5],
       [ 0,  0,  0,  0,  1,  8,  5],
       [ 0,  0,  0,  0,  1,  3,  5],
       [ 0,  0,  0,  0,  0,  9,  2],
       [ 0,  0,  0,  2,  4,  3,  2],
       [ 0,  0,  0,  0,  0,  3,  2],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  6],
       [ 0,  0,  0,  0,  1,  4,  2],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 0,  0,  0,  1, 12,  3, 13]])

In [75]:
encoded_text_pad = pad_sequences(encoded_text, padding = 'post')   # padding = 'post' -> 뒤쪽으로 채워줌
encoded_text_pad

array([[ 1,  5,  0,  0,  0,  0,  0],
       [ 1,  8,  5,  0,  0,  0,  0],
       [ 1,  3,  5,  0,  0,  0,  0],
       [ 9,  2,  0,  0,  0,  0,  0],
       [ 2,  4,  3,  2,  0,  0,  0],
       [ 3,  2,  0,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  6,  0,  0,  0,  0],
       [ 1,  4,  2,  0,  0,  0,  0],
       [ 7,  7,  3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0,  0,  0]])

In [76]:
encoded_text_pad = pad_sequences(encoded_text, padding = 'post', maxlen = 5)      # maxlen = 5 caps every row at 5 tokens; no truncating argument is passed, and the 7-token row loses its first two tokens (see the output below)
encoded_text_pad

array([[ 1,  5,  0,  0,  0],
       [ 1,  8,  5,  0,  0],
       [ 1,  3,  5,  0,  0],
       [ 9,  2,  0,  0,  0],
       [ 2,  4,  3,  2,  0],
       [ 3,  2,  0,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  6,  0,  0],
       [ 1,  4,  2,  0,  0],
       [ 3,  2, 10,  1, 11],
       [ 1, 12,  3, 13,  0]])

### 영어

### 1. nltk

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GM220808\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.tokenize import word_tokenize

text = 'Think like a man of action and act like man of thought.'
print(word_tokenize(text))

['Think', 'like', 'a', 'man', 'of', 'action', 'and', 'act', 'like', 'man', 'of', 'thought', '.']


In [3]:
from nltk.tokenize import sent_tokenize

text = 'Courage is very important. Like a muscle, it is strengthened by use.'
print(sent_tokenize(text))

['Courage is very important.', 'Like a muscle, it is strengthened by use.']


### 2. spacy

In [4]:
!pip install spacy



In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 40.9 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import spacy

nlp = spacy.load('en_core_web_sm')
text = 'Think like a man of action and act like man of thought.'
doc = nlp(text)

tokenized_spacy = [token.text for token in doc]
tokenized_spacy

['Think',
 'like',
 'a',
 'man',
 'of',
 'action',
 'and',
 'act',
 'like',
 'man',
 'of',
 'thought',
 '.']

In [91]:
text = 'Courage is very important. Like a muscle, it is strengthened by use.'
doc = nlp(text)
tokenized_sent = [sent.text for sent in doc.sents]
tokenized_sent

['Courage is very important.', 'Like a muscle, it is strengthened by use.']

### 한국어

### 1. konlpy(okt)

In [7]:
!pip install konlpy



In [8]:
!pip install jpype1



In [9]:
import os

# Set JAVA_HOME to the local JDK install (presumably so KoNLPy can find a JVM —
# confirm the path matches this machine). A raw string keeps single backslashes;
# the previous '\\\\' literal stored doubled separators in the path.
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk1.8.0_251'

In [11]:
text = '문제는 지능적인 기계가 감정을 가질 수 있는가가 아니라, 기계가 감정 없이 지능적일 수 있는가이다.'

In [12]:
from konlpy.tag import Okt

okt = Okt()              # Open Korean Text
print(okt.morphs(text))   # 형태소로 나눈다.

['문제', '는', '지능', '적', '인', '기계', '가', '감정', '을', '가질', '수', '있는가가', '아니라', ',', '기계', '가', '감정', '없이', '지능', '적', '일', '수', '있는가이다', '.']


In [100]:
print(okt.morphs(text, stem = True))   # 형태소로 나눈다. -> 어간으로 변경

['문제', '는', '지능', '적', '인', '기계', '가', '감정', '을', '가지다', '수', '있다', '아니다', ',', '기계', '가', '감정', '없이', '지능', '적', '일', '수', '있다', '.']


In [102]:
print(okt.nouns(text))   # 명사 추출

['문제', '지능', '기계', '감정', '수', '기계', '감정', '지능', '일', '수']


In [103]:
print(okt.phrases(text))   # extract phrases (the result can contain multi-word chunks, not just single words)

['문제', '기계', '감정', '지능적일', '지능적일 수']


In [104]:
print(okt.pos(text))   # 품사 태깅

[('문제', 'Noun'), ('는', 'Josa'), ('지능', 'Noun'), ('적', 'Suffix'), ('인', 'Josa'), ('기계', 'Noun'), ('가', 'Josa'), ('감정', 'Noun'), ('을', 'Josa'), ('가질', 'Verb'), ('수', 'Noun'), ('있는가가', 'Adjective'), ('아니라', 'Adjective'), (',', 'Punctuation'), ('기계', 'Noun'), ('가', 'Josa'), ('감정', 'Noun'), ('없이', 'Adverb'), ('지능', 'Noun'), ('적', 'Suffix'), ('일', 'Noun'), ('수', 'Noun'), ('있는가이다', 'Adjective'), ('.', 'Punctuation')]


In [105]:
print(okt.pos(text, join=True))   # 품사 태깅2

['문제/Noun', '는/Josa', '지능/Noun', '적/Suffix', '인/Josa', '기계/Noun', '가/Josa', '감정/Noun', '을/Josa', '가질/Verb', '수/Noun', '있는가가/Adjective', '아니라/Adjective', ',/Punctuation', '기계/Noun', '가/Josa', '감정/Noun', '없이/Adverb', '지능/Noun', '적/Suffix', '일/Noun', '수/Noun', '있는가이다/Adjective', './Punctuation']


### 2. konlpy(Kkma)

In [108]:
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.morphs(text))        # 형태소로 나눈다.

['문제', '는', '지능', '적', '이', 'ㄴ', '기계', '가', '감정', '을', '가지', 'ㄹ', '수', '있', '는', '가가', '아니', '라', ',', '기계', '가', '감정', '없이', '지능', '적', '이', 'ㄹ', '수', '있', '는', '가이', '이', '다', '.']


In [109]:
print(kkma.nouns(text))         # 명사추출

['문제', '지능', '기계', '감정', '수', '가가', '가이']


In [110]:
print(kkma.pos(text))           # 품사태깅

[('문제', 'NNG'), ('는', 'JX'), ('지능', 'NNG'), ('적', 'XSN'), ('이', 'VCP'), ('ㄴ', 'ETD'), ('기계', 'NNG'), ('가', 'JKS'), ('감정', 'NNG'), ('을', 'JKO'), ('가지', 'VV'), ('ㄹ', 'ETD'), ('수', 'NNB'), ('있', 'VV'), ('는', 'ETD'), ('가가', 'NNG'), ('아니', 'VCN'), ('라', 'ECD'), (',', 'SP'), ('기계', 'NNG'), ('가', 'JKS'), ('감정', 'NNG'), ('없이', 'MAG'), ('지능', 'NNG'), ('적', 'XSN'), ('이', 'VCP'), ('ㄹ', 'ETD'), ('수', 'NNB'), ('있', 'VV'), ('는', 'ETD'), ('가이', 'NNG'), ('이', 'VCP'), ('다', 'EFN'), ('.', 'SF')]


In [111]:
print(kkma.pos(text, join=True))   # 품사태깅2

['문제/NNG', '는/JX', '지능/NNG', '적/XSN', '이/VCP', 'ㄴ/ETD', '기계/NNG', '가/JKS', '감정/NNG', '을/JKO', '가지/VV', 'ㄹ/ETD', '수/NNB', '있/VV', '는/ETD', '가가/NNG', '아니/VCN', '라/ECD', ',/SP', '기계/NNG', '가/JKS', '감정/NNG', '없이/MAG', '지능/NNG', '적/XSN', '이/VCP', 'ㄹ/ETD', '수/NNB', '있/VV', '는/ETD', '가이/NNG', '이/VCP', '다/EFN', './SF']


## 불용어(stopword) 제거

In [115]:
okt = Okt()
text = '완전한 인공지능의 개발은 인류의 종말을 의미할 수 있다.'
stopwords = ['의', '은', '을', '할', '수']

# Split the sentence into morphemes, then keep only the tokens that are not
# listed as stopwords (`x not in y` is the idiomatic form of `not x in y`).
tokens = okt.morphs(text)
remove_stopwords = [token for token in tokens if token not in stopwords]
print('불용어 제거 전:', tokens)
print('불용어 제거 후:', remove_stopwords)

불용어 제거 전: ['완전한', '인공', '지능', '의', '개발', '은', '인류', '의', '종말', '을', '의미', '할', '수', '있다', '.']
불용어 제거 후: ['완전한', '인공', '지능', '개발', '인류', '종말', '의미', '있다', '.']


## 정규표현식

## 1. regex

### 2. regex(.)
 - '.'은 임의의 문자 한 개를 의미 (줄바꿈 문자 제외)
 - abc, adc, azc, aqc -> regex(a.c)

In [120]:
import re

# Pattern: 'a', then any single character, then 'c'.
r = re.compile(r'a.c')
for candidate in ('ccc', 'a1c'):
    print(r.search(candidate))

None
<re.Match object; span=(0, 3), match='a1c'>


In [123]:
# Pattern: 'a', then any two characters, then 'c'.
r = re.compile(r'a..c')
print(*map(r.search, ('ccc', 'a1xc')), sep='\n')

None
<re.Match object; span=(0, 4), match='a1xc'>


### 3. regex(?)
 - ? : 바로 앞의 문자가 있을 수도 있고 없을 수도 있을 때 (0개 또는 1개)

In [126]:
# Pattern: the 'b' between 'a' and 'c' is optional (zero or one occurrence).
r = re.compile(r'ab?c')
for candidate in ('abc', 'ac', 'abzc'):
    print(r.search(candidate))

<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 2), match='ac'>
None


### 4. regex(*)
 - '*' 바로 앞의 문자가 0개 이상 반복되는 경우

In [128]:
# Pattern: any number of 'b' (including none) between 'a' and 'c'.
r = re.compile(r'ab*c')
print(*map(r.search, ('ac', 'abc', 'abbbbbbbbbbbbbbbbbbbbbc')), sep='\n')

<re.Match object; span=(0, 2), match='ac'>
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 23), match='abbbbbbbbbbbbbbbbbbbbbc'>


### 5. regex(+)
 - '*'와 유사
 - 최소 1개 이상인 경우

In [129]:
# Pattern: at least one 'b' between 'a' and 'c'.
r = re.compile(r'ab+c')
for candidate in ('ac', 'abc', 'abbbbbbbbbbbbbbbbbc'):
    print(r.search(candidate))

None
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 19), match='abbbbbbbbbbbbbbbbbc'>


### 6. regex(^)
 - 시작하는 문자열

In [130]:
# Pattern: the string must start with 'ab'.
r = re.compile(r'^ab')
print(*map(r.search, ('abcccccc', 'acb', 'abczcxcs')), sep='\n')

<re.Match object; span=(0, 2), match='ab'>
None
<re.Match object; span=(0, 2), match='ab'>


### 7. regex({숫자})
 - 바로 앞의 문자가 숫자만큼 반복된 경우

In [131]:
# Pattern: exactly two 'b' between 'a' and 'c'.
r = re.compile(r'ab{2}c')
for candidate in ('ac', 'abc', 'abbc', 'abbbbbc'):
    print(r.search(candidate))

None
None
<re.Match object; span=(0, 4), match='abbc'>
None


### 8. regex({숫자1, 숫자2})
 - 바로 앞의 문자가 숫자1 ~ 숫자2만큼 반복된 경우
 - 숫자2를 생략하면 끝까지(무한대)

In [133]:
# Pattern: between two and five 'b' between 'a' and 'c'.
r = re.compile(r'ab{2,5}c')
print(*map(r.search, ('abc', 'abbc', 'abbbbbc', 'abbbbbbbbbbbbc')), sep='\n')

None
<re.Match object; span=(0, 4), match='abbc'>
<re.Match object; span=(0, 7), match='abbbbbc'>
None


In [134]:
# Pattern: two or more 'b' between 'a' and 'c' (no upper bound).
r = re.compile(r'ab{2,}c')
for candidate in ('abc', 'abbc', 'abbbbbc', 'abbbbbbbbbbbbc'):
    print(r.search(candidate))

None
<re.Match object; span=(0, 4), match='abbc'>
<re.Match object; span=(0, 7), match='abbbbbc'>
<re.Match object; span=(0, 14), match='abbbbbbbbbbbbc'>


### 9. regex([])
 - []안에 들어가는 문자와 매치
 - regex([a-zA-Z]) : 영어 전체를 의미
 - regex([0-9]) : 숫자 전체
 - regex([ㄱ-ㅎㅏ-ㅣ가-힣]) : 한글 전체

In [135]:
# Pattern: any one of the characters a, b, c, d (equivalent to [a-d]).
r = re.compile(r'[abcd]')
print(*map(r.search, ('zz', 'abcacaca', 'zzzzzzzzzaz')), sep='\n')

None
<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(9, 10), match='a'>


In [136]:
# Pattern: any single lowercase ASCII letter.
r = re.compile(r'[a-z]')
for candidate in ('AAA', '1111', 'aBCCdD'):
    print(r.search(candidate))

None
None
<re.Match object; span=(0, 1), match='a'>


### 10. regex([^문자열])
 - [] 안에 나열한 문자들을 제외한 모든 문자

In [139]:
# Pattern: any single character other than a, b, c, d.
r = re.compile(r'[^abcd]')
print(*map(r.search, ('a', 'bc', 'bczzzz', '11111', 'ABCD')), sep='\n')

None
None
<re.Match object; span=(2, 3), match='z'>
<re.Match object; span=(0, 1), match='1'>
<re.Match object; span=(0, 1), match='A'>


## regex module

### 1. re.match() vs re.search()
 - match : 문자열 첫 부분부터 정규표현식이 매치하는지를 확인
 - search : 문자열 전체에서 매치되는 부분이 있는지를 확인

In [145]:
# Compare match() and search() on the same input: 'abc' plus one more character,
# where the input only contains that sequence after a 'zzz' prefix.
r = re.compile(r'abc.')
sample = 'zzzabcd'
for finder in (r.match, r.search):
    print(finder(sample))

None
<re.Match object; span=(3, 7), match='abcd'>


### 2. re.split()

In [146]:
# Split the text at every single space.
text = '남 성 희'
re.split(pattern=' ', string=text)

['남', '성', '희']

In [149]:
# Four fields separated by newlines, then split back apart at each newline.
text = '\n'.join(['이름', '전화번호', '성별', '나이'])
re.split(pattern='\n', string=text)

['이름', '전화번호', '성별', '나이']